ggml.c 241 KB
Newer Older
xuxzh1's avatar
update  
xuxzh1 committed
1
#define _CRT_SECURE_NO_DEPRECATE // Disables "unsafe" warnings on Windows
xuxzh1's avatar
init  
xuxzh1 committed
2
3
#define _USE_MATH_DEFINES // For M_PI on MSVC

xuxzh1's avatar
update  
xuxzh1 committed
4
#include "ggml-backend.h"
xuxzh1's avatar
init  
xuxzh1 committed
5
#include "ggml-impl.h"
xuxzh1's avatar
update  
xuxzh1 committed
6
#include "ggml-threading.h"
xuxzh1's avatar
init  
xuxzh1 committed
7
#include "ggml.h"
xuxzh1's avatar
update  
xuxzh1 committed
8
9
10

// FIXME: required here for quantization functions
#include "ggml-quants.h"
xuxzh1's avatar
init  
xuxzh1 committed
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
#include "ggml-aarch64.h"

#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
#include <alloca.h>
#endif

#include <assert.h>
#include <errno.h>
#include <time.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>
#include <float.h>
#include <limits.h>
#include <stdarg.h>
#include <signal.h>
#if defined(__gnu_linux__)
#include <syscall.h>
#endif

xuxzh1's avatar
update  
xuxzh1 committed
36
#if defined(__APPLE__)
xuxzh1's avatar
init  
xuxzh1 committed
37
#include <unistd.h>
xuxzh1's avatar
update  
xuxzh1 committed
38
39
#include <mach/mach.h>
#include <TargetConditionals.h>
xuxzh1's avatar
init  
xuxzh1 committed
40
41
42
43
44
45
46
47
48
49
#endif

#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
    #define NOMINMAX
#endif
#include <windows.h>
#endif

xuxzh1's avatar
update  
xuxzh1 committed
50
#define UNUSED GGML_UNUSED
xuxzh1's avatar
init  
xuxzh1 committed
51

xuxzh1's avatar
update  
xuxzh1 committed
52
53
54
55
56
57
#if defined(_MSC_VER)
#define m512bh(p) p
#define m512i(p) p
#else
#define m512bh(p) (__m512bh)(p)
#define m512i(p) (__m512i)(p)
xuxzh1's avatar
init  
xuxzh1 committed
58
59
#endif

xuxzh1's avatar
update  
xuxzh1 committed
60
61
// precomputed f32 table for f16 (256 KB) (ggml-impl.h)
float ggml_table_f32_f16[1 << 16];
xuxzh1's avatar
init  
xuxzh1 committed
62
63
64

#if (defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)) && \
    (!defined(TARGET_OS_TV) && !defined(TARGET_OS_WATCH))
xuxzh1's avatar
update  
xuxzh1 committed
65
66
67
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
xuxzh1's avatar
init  
xuxzh1 committed
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
#include <sys/wait.h>

#if defined(__ANDROID__)
#include <unwind.h>
#include <dlfcn.h>
#include <stdio.h>

// Write cursor over a caller-provided array of frame addresses; filled in by unwind_callback.
struct backtrace_state {
    void ** current; // next slot a frame address will be stored into
    void ** end;     // one past the last usable slot; unwinding stops when current reaches it
};

// Frame visitor for _Unwind_Backtrace.
// `arg` is a struct backtrace_state *; the instruction pointer of each visited frame
// is appended to it. Frames with a zero IP are skipped. Once the buffer is full,
// _URC_END_OF_STACK is returned instead of storing another address.
static _Unwind_Reason_Code unwind_callback(struct _Unwind_Context* context, void* arg) {
    struct backtrace_state * bt = (struct backtrace_state *)arg;
    const uintptr_t ip = _Unwind_GetIP(context);

    if (ip == 0) {
        return _URC_NO_REASON;
    }
    if (bt->current == bt->end) {
        return _URC_END_OF_STACK;
    }

    *bt->current = (void*)ip;
    bt->current++;
    return _URC_NO_REASON;
}

// Android variant: gathers up to 100 frame addresses through the unwinder and prints
// one "<index>: <address> <symbol>" line per frame to stderr. The symbol is looked up
// with dladdr() and is left empty when no name is available.
static void ggml_print_backtrace_symbols(void) {
    void * frames[100];
    const int capacity = (int) (sizeof(frames)/sizeof(frames[0]));

    struct backtrace_state state = {frames, frames + capacity};
    _Unwind_Backtrace(unwind_callback, &state);

    // number of slots the callback actually filled
    const int n_frames = state.current - frames;

    for (int k = 0; k < n_frames; ++k) {
        const void * addr = frames[k];

        Dl_info info;
        const char * symbol = (dladdr(addr, &info) && info.dli_sname) ? info.dli_sname : "";

        fprintf(stderr, "%d: %p %s\n", k, addr, symbol);
    }
}
#elif defined(__linux__) && defined(__GLIBC__)
#include <execinfo.h>
// glibc variant: captures up to 100 frames with backtrace() and writes them
// straight to the stderr file descriptor via backtrace_symbols_fd().
static void ggml_print_backtrace_symbols(void) {
    void * frames[100];
    const int n_frames = backtrace(frames, sizeof(frames)/sizeof(frames[0]));
    backtrace_symbols_fd(frames, n_frames, STDERR_FILENO);
}
#else
// Fallback compiled when neither the Android nor the Linux/glibc variant applies:
// intentionally prints nothing.
static void ggml_print_backtrace_symbols(void) {
    // platform not supported
}
#endif

// Prints a backtrace of the current process to aid post-mortem debugging.
//
// A child process is forked which tries to attach a debugger to the parent:
// gdb first, then lldb. If neither can be exec'ed the child exits with
// EXIT_FAILURE and the parent falls back to ggml_print_backtrace_symbols().
// The same fallback is used when the child cannot be forked at all.
static void ggml_print_backtrace(void) {
    // "attach <pid>" is the gdb command; the bare pid that follows the prefix
    // is reused for lldb's -p option
    char attach[32];
    snprintf(attach, sizeof(attach), "attach %d", (int) getpid());
    const char * pid_str = &attach[sizeof("attach ") - 1];

    const pid_t pid = fork();
    if (pid < 0) {
        // no child available to run a debugger (and waitpid(-1) would wait for
        // an unrelated child), so print whatever we can from this process
        ggml_print_backtrace_symbols();
        return;
    }

    if (pid == 0) {
        // try gdb
        execlp("gdb", "gdb", "--batch",
            "-ex", "set style enabled on",
            "-ex", attach,
            "-ex", "bt -frame-info source-and-location",
            "-ex", "detach",
            "-ex", "quit",
            (char *) NULL);
        // try lldb (only reached when gdb could not be exec'ed)
        execlp("lldb", "lldb", "--batch",
            "-o", "bt",
            "-o", "quit",
            "-p", pid_str,
            (char *) NULL);
        // _exit: do not run the parent's atexit handlers or flush its
        // duplicated stdio buffers from the forked child
        _exit(EXIT_FAILURE);
    }

    int wstatus = 0;
    if (waitpid(pid, &wstatus, 0) != pid) {
        return;
    }
    if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == EXIT_FAILURE) {
        // gdb failed, fallback to backtrace_symbols
        ggml_print_backtrace_symbols();
    }
}
#else
// Fallback for platforms excluded by the surrounding #if (no fork/waitpid based
// debugger attach is attempted): intentionally does nothing.
static void ggml_print_backtrace(void) {
    // platform not supported
}
#endif

// Reports a fatal error and terminates the process.
//
// Writes "<file>:<line>: <formatted message>\n" to stderr, prints a backtrace
// via ggml_print_backtrace() and then calls abort(); this function does not return.
//   file, line : source location to report
//   fmt, ...   : printf-style message
void ggml_abort(const char * file, int line, const char * fmt, ...) {
    // push out pending stdout output before the diagnostic is written to stderr
    fflush(stdout);

    fprintf(stderr, "%s:%d: ", file, line);

    va_list ap;
    va_start(ap, fmt);
    vfprintf(stderr, fmt, ap);
    va_end(ap);

    fputc('\n', stderr);

    ggml_print_backtrace();
    abort();
}

//
// logging
//

xuxzh1's avatar
update  
xuxzh1 committed
184
185
186
187
188
// Process-wide logging sink: the callback every formatted log message is handed to,
// plus the opaque pointer passed back to that callback on each call.
struct ggml_logger_state {
    ggml_log_callback log_callback;
    void * log_callback_user_data;
};
// Starts out routing messages to ggml_log_callback_default with no user data.
static struct ggml_logger_state g_logger_state = {ggml_log_callback_default, NULL};
xuxzh1's avatar
init  
xuxzh1 committed
189

xuxzh1's avatar
update  
xuxzh1 committed
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
// Formats a log message and forwards it to the registered log callback.
//
//   level  : severity passed through to the callback unchanged
//   format : printf-style format string; a NULL format is ignored
//   args   : arguments for `format`
//
// Messages shorter than 128 bytes are formatted on the stack; longer ones are
// formatted into a temporary heap buffer. If formatting fails nothing is logged.
// If the heap buffer cannot be allocated, the truncated stack copy is logged.
static void ggml_log_internal_v(enum ggml_log_level level, const char * format, va_list args) {
    if (format == NULL) {
        return;
    }
    // `args` is consumed by the first vsnprintf, keep a copy for a second pass
    va_list args_copy;
    va_copy(args_copy, args);
    char buffer[128];
    const int len = vsnprintf(buffer, sizeof(buffer), format, args);
    if (len < 0) {
        // encoding error: buffer contents are unspecified, so emit nothing
    } else if ((size_t) len < sizeof(buffer)) {
        g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data);
    } else {
        char * buffer2 = (char *) calloc((size_t) len + 1, sizeof(char));
        if (buffer2 != NULL) {
            vsnprintf(buffer2, (size_t) len + 1, format, args_copy);
            buffer2[len] = 0;
            g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data);
            free(buffer2);
        } else {
            // out of memory: the stack buffer holds a NUL-terminated, truncated message
            g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data);
        }
    }
    va_end(args_copy);
}
xuxzh1's avatar
init  
xuxzh1 committed
209

xuxzh1's avatar
update  
xuxzh1 committed
210
211
212
213
214
215
// Variadic front end of ggml_log_internal_v: packs the printf-style arguments
// into a va_list and forwards them together with `level` and `format`.
void ggml_log_internal(enum ggml_log_level level, const char * format, ...) {
    va_list ap;
    va_start(ap, format);
    ggml_log_internal_v(level, format, ap);
    va_end(ap);
}
xuxzh1's avatar
init  
xuxzh1 committed
216

xuxzh1's avatar
update  
xuxzh1 committed
217
218
219
220
221
222
// Default log sink: writes `text` verbatim to stderr and flushes it.
// Neither the level nor the user data influence the output.
void ggml_log_callback_default(enum ggml_log_level level, const char * text, void * user_data) {
    (void) user_data;
    (void) level;

    fputs(text, stderr);
    fflush(stderr);
}
xuxzh1's avatar
init  
xuxzh1 committed
223
224
225
226
227
228
229
230
231
232
233

//
// end of logging block
//

#ifdef GGML_USE_ACCELERATE
// uncomment to use vDSP for soft max computation
// note: not sure if it is actually faster
//#define GGML_SOFT_MAX_ACCELERATE
#endif

xuxzh1's avatar
update  
xuxzh1 committed
234
235
236
237

// Allocates `size` bytes through the platform's aligned allocator.
//
// An alignment of 64 is requested from _aligned_malloc (MSVC/MinGW),
// hbw_posix_memalign (GGML_USE_CPU_HBM) or posix_memalign; the macOS path uses
// vm_allocate and does not pass the alignment on.
//
// Returns NULL on failure (after logging the reason). On the non-Windows paths
// a request for 0 bytes logs a warning and returns NULL.
// Memory must be released with ggml_aligned_free().
void * ggml_aligned_malloc(size_t size) {
    const int alignment = 64;

#if defined(_MSC_VER) || defined(__MINGW32__)
    return _aligned_malloc(size, alignment);
#else
    if (size == 0) {
        GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
        return NULL;
    }
    void * aligned_memory = NULL;
  #ifdef GGML_USE_CPU_HBM
    int result = hbw_posix_memalign(&aligned_memory, alignment, size);
  #elif TARGET_OS_OSX
    GGML_UNUSED(alignment);
    kern_return_t alloc_status = vm_allocate((vm_map_t) mach_task_self(), (vm_address_t *) &aligned_memory, size, VM_FLAGS_ANYWHERE);
    // translate the Mach status into the errno values handled below
    int result = EFAULT;
    switch (alloc_status) {
        case KERN_SUCCESS:
            result = 0;
            break;
        case KERN_INVALID_ADDRESS:
            result = EINVAL;
            break;
        case KERN_NO_SPACE:
            result = ENOMEM;
            break;
        default:
            result = EFAULT;
            break;
    }
  #else
    int result = posix_memalign(&aligned_memory, alignment, size);
  #endif
    if (result != 0) {
        // Handle allocation failure
        const char *error_desc = "unknown allocation error";
        switch (result) {
            case EINVAL:
                error_desc = "invalid alignment value";
                break;
            case ENOMEM:
                error_desc = "insufficient memory";
                break;
        }
        GGML_LOG_ERROR("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
        return NULL;
    }
    return aligned_memory;
#endif
}
xuxzh1's avatar
update  
xuxzh1 committed
286
287
288
289
290
291
292
293
294
295
296
297
298

// Releases memory obtained from ggml_aligned_malloc().
//
//   ptr  : pointer to free; NULL is accepted on every path
//   size : size of the allocation; only the macOS vm_deallocate path reads it
//
// The preprocessor branches mirror the allocator selection in ggml_aligned_malloc.
void ggml_aligned_free(void * ptr, size_t size) {
    GGML_UNUSED(size);
#if defined(_MSC_VER) || defined(__MINGW32__)
    _aligned_free(ptr);
#elif defined(GGML_USE_CPU_HBM)
    // tested with defined() to match the #ifdef used by ggml_aligned_malloc
    if (ptr != NULL) {
        hbw_free(ptr);
    }
#elif TARGET_OS_OSX
    if (ptr != NULL) {
        vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ptr, size);
    }
#else
    free(ptr);
#endif
}

xuxzh1's avatar
init  
xuxzh1 committed
304
305
306

// malloc wrapper that never returns NULL for a non-zero request.
//
// A request for 0 bytes logs a warning and returns NULL. If malloc fails, the
// error is logged and the process is terminated through GGML_ABORT.
inline static void * ggml_malloc(size_t size) {
    if (size == 0) {
        GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_malloc!\n");
        return NULL;
    }
    void * result = malloc(size);
    if (result == NULL) {
        GGML_LOG_ERROR("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
        GGML_ABORT("fatal error");
    }
    return result;
}

// calloc
// calloc wrapper that never returns NULL for a non-empty request.
//
// If either `num` or `size` is 0 a warning is logged and NULL is returned.
// If calloc fails, the total requested size is logged and the process is
// terminated through GGML_ABORT.
inline static void * ggml_calloc(size_t num, size_t size) {
    if (num == 0 || size == 0) {
        GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_calloc!\n");
        return NULL;
    }
    void * result = calloc(num, size);
    if (result == NULL) {
        // report the whole request (num elements of size bytes); computed in
        // double so the product cannot wrap around
        GGML_LOG_ERROR("%s: failed to allocate %6.2f MB\n", __func__, ((double) num * (double) size)/(1024.0*1024.0));
        GGML_ABORT("fatal error");
    }
    return result;
}

// Allocation entry points used in this file. GGML_MALLOC/GGML_CALLOC go through the
// wrappers above (which abort on allocation failure); GGML_FREE is plain free().
#define GGML_MALLOC(size)      ggml_malloc(size)
#define GGML_CALLOC(num, size) ggml_calloc(num, size)

#define GGML_FREE(ptr) free(ptr)

xuxzh1's avatar
update  
xuxzh1 committed
337
const char * ggml_status_to_string(enum ggml_status status) {
xuxzh1's avatar
init  
xuxzh1 committed
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
    switch (status) {
        case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)";
        case GGML_STATUS_FAILED:       return "GGML status: error (operation failed)";
        case GGML_STATUS_SUCCESS:      return "GGML status: success";
        case GGML_STATUS_ABORTED:      return "GGML status: warning (operation aborted)";
    }

    return "GGML status: unknown";
}

// Public scalar fp16 -> fp32 conversion; thin wrapper around GGML_FP16_TO_FP32.
float ggml_fp16_to_fp32(ggml_fp16_t x) {
// From here on the public name is redefined to an undeclared identifier, so any
// later use inside this file fails to compile; use GGML_FP16_TO_FP32 instead.
#define ggml_fp16_to_fp32 do_not_use__ggml_fp16_to_fp32__in_ggml
    return GGML_FP16_TO_FP32(x);
}

// Public scalar fp32 -> fp16 conversion; thin wrapper around GGML_FP32_TO_FP16.
ggml_fp16_t ggml_fp32_to_fp16(float x) {
// From here on the public name is redefined to an undeclared identifier, so any
// later use inside this file fails to compile; use GGML_FP32_TO_FP16 instead.
#define ggml_fp32_to_fp16 do_not_use__ggml_fp32_to_fp16__in_ggml
    return GGML_FP32_TO_FP16(x);
}

// Public scalar bf16 -> fp32 conversion; thin wrapper around GGML_BF16_TO_FP32.
float ggml_bf16_to_fp32(ggml_bf16_t x) {
// From here on the public name is redefined to an undeclared identifier, so any
// later use inside this file fails to compile; use GGML_BF16_TO_FP32 instead.
#define ggml_bf16_to_fp32 do_not_use__ggml_bf16_to_fp32__in_ggml
    return GGML_BF16_TO_FP32(x);  // it just left shifts
}

// Public scalar fp32 -> bf16 conversion; thin wrapper around GGML_FP32_TO_BF16.
ggml_bf16_t ggml_fp32_to_bf16(float x) {
// From here on the public name is redefined to an undeclared identifier, so any
// later use inside this file fails to compile; use GGML_FP32_TO_BF16 instead.
#define ggml_fp32_to_bf16 do_not_use__ggml_fp32_to_bf16__in_ggml
    return GGML_FP32_TO_BF16(x);
}

// Converts `n` fp16 values from `x` into fp32 values in `y`, one element at a time.
void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
    int64_t idx = 0;
    while (idx < n) {
        y[idx] = GGML_FP16_TO_FP32(x[idx]);
        idx++;
    }
}

xuxzh1's avatar
update  
xuxzh1 committed
374
375
// FIXME: these functions must detect the instruction set at runtime, since they are part of the core ggml library
//        currently, the ggml_cpu_has_* functions are entirely compile-time
xuxzh1's avatar
init  
xuxzh1 committed
376
377
378
// Converts `n` fp32 values from `x` into fp16 values in `y`.
//
// When compiled with F16C support, 8 and then 4 elements at a time are converted
// with the hardware instructions (round to nearest); the remaining elements, or
// all of them without F16C, go through GGML_FP32_TO_FP16.
void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
    int64_t i = 0;
#if defined(__F16C__)
    //if (ggml_cpu_has_f16c()) {
        for (; i + 7 < n; i += 8) {
            __m256 x_vec = _mm256_loadu_ps(x + i);
            __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
            _mm_storeu_si128((__m128i *)(y + i), y_vec);
        }
        for(; i + 3 < n; i += 4) {
            __m128 x_vec = _mm_loadu_ps(x + i);
            __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
            // only the low 64 bits (4 half floats) are valid here
            _mm_storel_epi64((__m128i *)(y + i), y_vec);
        }
    //}
#endif
    // scalar tail
    for (; i < n; i++) {
        y[i] = GGML_FP32_TO_FP16(x[i]);
    }
}

// Converts `n` bf16 values from `x` into fp32 values in `y`.
//
// The vector paths (16 elements per step with AVX-512F, 8 per step with AVX2)
// zero-extend each 16-bit value to 32 bits and shift it left by 16; the
// remaining elements go through GGML_BF16_TO_FP32.
void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
    int64_t i = 0;
#if defined(__AVX512F__)
    //if (ggml_cpu_has_avx512()) {
        for (; i + 16 <= n; i += 16) {
            _mm512_storeu_ps(y + i,
                            _mm512_castsi512_ps(
                                _mm512_slli_epi32(
                                    _mm512_cvtepu16_epi32(
                                        _mm256_loadu_si256(
                                            (const __m256i *)(x + i))),
                                    16)));
        }
    //}
#endif
#if defined(__AVX2__)
    //if (ggml_cpu_has_avx2()) {
        for (; i + 8 <= n; i += 8) {
            _mm256_storeu_ps(y + i,
                            _mm256_castsi256_ps(
                                _mm256_slli_epi32(
                                    _mm256_cvtepu16_epi32(
                                        _mm_loadu_si128(
                                            (const __m128i *)(x + i))),
                                    16)));
        }
    //}
#endif
    // scalar tail
    for (; i < n; i++) {
        y[i] = GGML_BF16_TO_FP32(x[i]);
    }
}

// Reference fp32 -> bf16 row conversion: converts `n` values from `x` into `y`
// with ggml_compute_fp32_to_bf16, one element at a time.
void ggml_fp32_to_bf16_row_ref(const float * x, ggml_bf16_t * y, int64_t n) {
    // the index has the same width as `n` so rows longer than INT_MAX do not overflow it
    for (int64_t i = 0; i < n; i++) {
        y[i] = ggml_compute_fp32_to_bf16(x[i]);
    }
}

// Converts `n` fp32 values from `x` into bf16 values in `y`.
//
// With AVX512-BF16, 32 elements per step are converted in hardware; the
// remaining elements, or all of them otherwise, go through GGML_FP32_TO_BF16.
void ggml_fp32_to_bf16_row(const float * x, ggml_bf16_t * y, int64_t n) {
    // the index has the same width as `n` so rows longer than INT_MAX do not overflow it
    int64_t i = 0;
#if defined(__AVX512BF16__)
    // subnormals are flushed to zero on this platform
    for (; i + 32 <= n; i += 32) {
        _mm512_storeu_si512(
            (__m512i *)(y + i),
            m512i(_mm512_cvtne2ps_pbh(_mm512_loadu_ps(x + i + 16),
                                _mm512_loadu_ps(x + i))));
    }
#endif
    // scalar tail
    for (; i < n; i++) {
        y[i] = GGML_FP32_TO_BF16(x[i]);
    }
}

// Returns true when both GUIDs hold the same sizeof(ggml_guid) bytes.
bool ggml_guid_matches(ggml_guid_t guid_a, ggml_guid_t guid_b) {
    const int diff = memcmp(guid_a, guid_b, sizeof(ggml_guid));
    return diff == 0;
}

//
// timing
//

#if defined(_MSC_VER) || defined(__MINGW32__)
// Performance-counter frequency (ticks per second) and the counter value sampled
// by ggml_time_init(); both are read by ggml_time_ms()/ggml_time_us().
static int64_t timer_freq, timer_start;

// Samples the performance-counter frequency and the current counter value.
// Must run before ggml_time_ms()/ggml_time_us(), which divide by timer_freq.
void ggml_time_init(void) {
    LARGE_INTEGER freq;
    LARGE_INTEGER now;

    QueryPerformanceFrequency(&freq);
    timer_freq = freq.QuadPart;

    // The multiplication by 1000 or 1000000 below can cause an overflow if timer_freq
    // and the uptime is high enough.
    // We subtract the program start time to reduce the likelihood of that happening.
    QueryPerformanceCounter(&now);
    timer_start = now.QuadPart;
}
// Milliseconds elapsed since ggml_time_init() was called.
// ggml_time_init() must have run first (timer_freq is used as a divisor).
int64_t ggml_time_ms(void) {
    LARGE_INTEGER t;
    QueryPerformanceCounter(&t);
    const int64_t ticks = t.QuadPart - timer_start;
    // Scale whole seconds and the sub-second remainder separately: the result is the
    // same as (ticks * 1000) / timer_freq, but the product can no longer overflow
    // int64_t after a long uptime or with a high-frequency counter.
    return (ticks / timer_freq) * 1000 + ((ticks % timer_freq) * 1000) / timer_freq;
}
// Microseconds elapsed since ggml_time_init() was called.
// ggml_time_init() must have run first (timer_freq is used as a divisor).
int64_t ggml_time_us(void) {
    LARGE_INTEGER t;
    QueryPerformanceCounter(&t);
    const int64_t ticks = t.QuadPart - timer_start;
    // Scale whole seconds and the sub-second remainder separately: the result is the
    // same as (ticks * 1000000) / timer_freq, but the product can no longer overflow
    // int64_t after a long uptime or with a high-frequency counter.
    return (ticks / timer_freq) * 1000000 + ((ticks % timer_freq) * 1000000) / timer_freq;
}
#else
// Nothing to set up on this path: the timers below read CLOCK_MONOTONIC directly.
void ggml_time_init(void) {}
// Current CLOCK_MONOTONIC reading in whole milliseconds.
int64_t ggml_time_ms(void) {
    struct timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);

    const int64_t ms_from_sec  = (int64_t)now.tv_sec*1000;
    const int64_t ms_from_nsec = (int64_t)now.tv_nsec/1000000;
    return ms_from_sec + ms_from_nsec;
}

// Current CLOCK_MONOTONIC reading in whole microseconds.
int64_t ggml_time_us(void) {
    struct timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);

    const int64_t us_from_sec  = (int64_t)now.tv_sec*1000000;
    const int64_t us_from_nsec = (int64_t)now.tv_nsec/1000;
    return us_from_sec + us_from_nsec;
}
#endif

// Returns the value of clock(), i.e. processor time in units of 1/CLOCKS_PER_SEC s.
int64_t ggml_cycles(void) {
    const clock_t ticks = clock();
    return ticks;
}

// Number of ggml_cycles() ticks per millisecond (CLOCKS_PER_SEC divided by 1000).
int64_t ggml_cycles_per_ms(void) {
    const int64_t per_ms = CLOCKS_PER_SEC/1000;
    return per_ms;
}

//
// cross-platform UTF-8 file paths
//

#ifdef _WIN32
// Converts the NUL-terminated UTF-8 string `mbs` to a newly allocated wide string.
// Returns NULL with errno set to EINVAL when the conversion fails; on success
// the caller releases the result with GGML_FREE.
static wchar_t * ggml_mbstowcs(const char * mbs) {
    // first call: no output buffer, only the required length is returned
    const int needed = MultiByteToWideChar(CP_UTF8, 0, mbs, -1, NULL, 0);
    if (needed == 0) {
        errno = EINVAL;
        return NULL;
    }

    wchar_t * wide = GGML_MALLOC(needed * sizeof(wchar_t));

    // second call: perform the actual conversion
    const int written = MultiByteToWideChar(CP_UTF8, 0, mbs, -1, wide, needed);
    if (written == 0) {
        GGML_FREE(wide);
        errno = EINVAL;
        return NULL;
    }

    return wide;
}
#endif

// fopen() replacement that accepts UTF-8 file names on every platform.
//
//   fname : path of the file, UTF-8 encoded
//   mode  : fopen mode string
//
// On Windows both arguments are converted to wide strings and _wfopen is used;
// NULL is returned if the file name cannot be converted. Elsewhere this is a
// plain fopen(). Returns NULL on failure, like fopen().
FILE * ggml_fopen(const char * fname, const char * mode) {
#ifdef _WIN32
    FILE * file = NULL;

    // convert fname (UTF-8)
    wchar_t * wfname = ggml_mbstowcs(fname);
    if (wfname) {
        // convert mode (ANSI): widen each character, terminator included
        wchar_t * wmode = GGML_MALLOC((strlen(mode) + 1) * sizeof(wchar_t));
        wchar_t * wmode_p = wmode;
        do {
            *wmode_p++ = (wchar_t)*mode;
        } while (*mode++);

        // open file
        file = _wfopen(wfname, wmode);

        GGML_FREE(wfname);
        GGML_FREE(wmode);
    }

    return file;
#else
    return fopen(fname, mode);
#endif
}
xuxzh1's avatar
init  
xuxzh1 committed
557
558
559
560
// Forward declarations of the f32 / f16 / bf16 vector dot-product routines;
// their definitions are not in this part of the file.
static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc);
static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc);

xuxzh1's avatar
update  
xuxzh1 committed
561
static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
xuxzh1's avatar
init  
xuxzh1 committed
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
    [GGML_TYPE_I8] = {
        .type_name                = "i8",
        .blck_size                = 1,
        .type_size                = sizeof(int8_t),
        .is_quantized             = false,
    },
    [GGML_TYPE_I16] = {
        .type_name                = "i16",
        .blck_size                = 1,
        .type_size                = sizeof(int16_t),
        .is_quantized             = false,
    },
    [GGML_TYPE_I32] = {
        .type_name                = "i32",
        .blck_size                = 1,
        .type_size                = sizeof(int32_t),
        .is_quantized             = false,
    },
    [GGML_TYPE_I64] = {
        .type_name                = "i64",
        .blck_size                = 1,
        .type_size                = sizeof(int64_t),
        .is_quantized             = false,
    },
    [GGML_TYPE_F64] = {
        .type_name                = "f64",
        .blck_size                = 1,
        .type_size                = sizeof(double),
        .is_quantized             = false,
    },
    [GGML_TYPE_F32] = {
        .type_name                = "f32",
        .blck_size                = 1,
        .type_size                = sizeof(float),
        .is_quantized             = false,
    },
    [GGML_TYPE_F16] = {
        .type_name                = "f16",
        .blck_size                = 1,
        .type_size                = sizeof(ggml_fp16_t),
        .is_quantized             = false,
        .to_float                 = (ggml_to_float_t) ggml_fp16_to_fp32_row,
        .from_float_ref           = (ggml_from_float_t) ggml_fp32_to_fp16_row,
    },
    [GGML_TYPE_Q4_0] = {
        .type_name                = "q4_0",
        .blck_size                = QK4_0,
        .type_size                = sizeof(block_q4_0),
        .is_quantized             = true,
        .to_float                 = (ggml_to_float_t) dequantize_row_q4_0,
        .from_float_ref           = (ggml_from_float_t) quantize_row_q4_0_ref,
    },
    [GGML_TYPE_Q4_1] = {
        .type_name                = "q4_1",
        .blck_size                = QK4_1,
        .type_size                = sizeof(block_q4_1),
        .is_quantized             = true,
        .to_float                 = (ggml_to_float_t) dequantize_row_q4_1,
        .from_float_ref           = (ggml_from_float_t) quantize_row_q4_1_ref,
    },
    [4] = { // GGML_TYPE_Q4_2
        .type_name                = "DEPRECATED",
        .blck_size                = 0,
        .type_size                = 0,
        .is_quantized             = false,
    },
    [5] = { // GGML_TYPE_Q4_3
        .type_name                = "DEPRECATED",
        .blck_size                = 0,
        .type_size                = 0,
        .is_quantized             = false,
    },
    [GGML_TYPE_Q5_0] = {
        .type_name                = "q5_0",
        .blck_size                = QK5_0,
        .type_size                = sizeof(block_q5_0),
        .is_quantized             = true,
        .to_float                 = (ggml_to_float_t) dequantize_row_q5_0,
        .from_float_ref           = (ggml_from_float_t) quantize_row_q5_0_ref,
    },
    [GGML_TYPE_Q5_1] = {
        .type_name                = "q5_1",
        .blck_size                = QK5_1,
        .type_size                = sizeof(block_q5_1),
        .is_quantized             = true,
        .to_float                 = (ggml_to_float_t) dequantize_row_q5_1,
        .from_float_ref           = (ggml_from_float_t) quantize_row_q5_1_ref,
    },
    [GGML_TYPE_Q8_0] = {
        .type_name                = "q8_0",
        .blck_size                = QK8_0,
        .type_size                = sizeof(block_q8_0),
        .is_quantized             = true,
        .to_float                 = (ggml_to_float_t) dequantize_row_q8_0,
        .from_float_ref           = (ggml_from_float_t) quantize_row_q8_0_ref,
    },
    [GGML_TYPE_Q8_1] = {
        .type_name                = "q8_1",
        .blck_size                = QK8_1,
        .type_size                = sizeof(block_q8_1),
        .is_quantized             = true,
        .from_float_ref           = (ggml_from_float_t) quantize_row_q8_1_ref,
    },
    [GGML_TYPE_Q2_K] = {
        .type_name                = "q2_K",
        .blck_size                = QK_K,
        .type_size                = sizeof(block_q2_K),
        .is_quantized             = true,
        .to_float                 = (ggml_to_float_t) dequantize_row_q2_K,
        .from_float_ref           = (ggml_from_float_t) quantize_row_q2_K_ref,
    },
    [GGML_TYPE_Q3_K] = {
        .type_name                = "q3_K",
        .blck_size                = QK_K,
        .type_size                = sizeof(block_q3_K),
        .is_quantized             = true,
        .to_float                 = (ggml_to_float_t) dequantize_row_q3_K,
        .from_float_ref           = (ggml_from_float_t) quantize_row_q3_K_ref,
    },
    [GGML_TYPE_Q4_K] = {
        .type_name                = "q4_K",
        .blck_size                = QK_K,
        .type_size                = sizeof(block_q4_K),
        .is_quantized             = true,
        .to_float                 = (ggml_to_float_t) dequantize_row_q4_K,
        .from_float_ref           = (ggml_from_float_t) quantize_row_q4_K_ref,
    },
    [GGML_TYPE_Q5_K] = {
        .type_name                = "q5_K",
        .blck_size                = QK_K,
        .type_size                = sizeof(block_q5_K),
        .is_quantized             = true,
        .to_float                 = (ggml_to_float_t) dequantize_row_q5_K,
        .from_float_ref           = (ggml_from_float_t) quantize_row_q5_K_ref,
    },
    [GGML_TYPE_Q6_K] = {
        .type_name                = "q6_K",
        .blck_size                = QK_K,
        .type_size                = sizeof(block_q6_K),
        .is_quantized             = true,
        .to_float                 = (ggml_to_float_t) dequantize_row_q6_K,
        .from_float_ref           = (ggml_from_float_t) quantize_row_q6_K_ref,
    },
    [GGML_TYPE_IQ2_XXS] = {
        .type_name                = "iq2_xxs",
        .blck_size                = QK_K,
        .type_size                = sizeof(block_iq2_xxs),
        .is_quantized             = true,
        .to_float                 = (ggml_to_float_t) dequantize_row_iq2_xxs,
        .from_float_ref           = NULL,
    },
    [GGML_TYPE_IQ2_XS] = {
        .type_name                = "iq2_xs",
        .blck_size                = QK_K,
        .type_size                = sizeof(block_iq2_xs),
        .is_quantized             = true,
        .to_float                 = (ggml_to_float_t) dequantize_row_iq2_xs,
        .from_float_ref           = NULL,
    },
    [GGML_TYPE_IQ3_XXS] = {
        .type_name                = "iq3_xxs",
        .blck_size                = QK_K,
        .type_size                = sizeof(block_iq3_xxs),
        .is_quantized             = true,
        .to_float                 = (ggml_to_float_t) dequantize_row_iq3_xxs,
        .from_float_ref           = (ggml_from_float_t)quantize_row_iq3_xxs_ref,
    },
    [GGML_TYPE_IQ3_S] = {
        .type_name                = "iq3_s",
        .blck_size                = QK_K,
        .type_size                = sizeof(block_iq3_s),
        .is_quantized             = true,
        .to_float                 = (ggml_to_float_t) dequantize_row_iq3_s,
        .from_float_ref           = (ggml_from_float_t)quantize_row_iq3_s_ref,
    },
    [GGML_TYPE_IQ2_S] = {
        .type_name                = "iq2_s",
        .blck_size                = QK_K,
        .type_size                = sizeof(block_iq2_s),
        .is_quantized             = true,
        .to_float                 = (ggml_to_float_t) dequantize_row_iq2_s,
        .from_float_ref           = (ggml_from_float_t)quantize_row_iq2_s_ref,
    },
    [GGML_TYPE_IQ1_S] = {
        .type_name                = "iq1_s",
        .blck_size                = QK_K,
        .type_size                = sizeof(block_iq1_s),
        .is_quantized             = true,
        .to_float                 = (ggml_to_float_t) dequantize_row_iq1_s,
        .from_float_ref           = NULL,
    },
    [GGML_TYPE_IQ1_M] = {
        .type_name                = "iq1_m",
        .blck_size                = QK_K,
        .type_size                = sizeof(block_iq1_m),
        .is_quantized             = true,
        .to_float                 = (ggml_to_float_t) dequantize_row_iq1_m,
        .from_float_ref           = NULL,
    },
    [GGML_TYPE_IQ4_NL] = {
        .type_name                = "iq4_nl",
        .blck_size                = QK4_NL,
        .type_size                = sizeof(block_iq4_nl),
        .is_quantized             = true,
        .to_float                 = (ggml_to_float_t) dequantize_row_iq4_nl,
        .from_float_ref           = (ggml_from_float_t)quantize_row_iq4_nl_ref,
    },
    [GGML_TYPE_IQ4_XS] = {
        .type_name                = "iq4_xs",
        .blck_size                = QK_K,
        .type_size                = sizeof(block_iq4_xs),
        .is_quantized             = true,
        .to_float                 = (ggml_to_float_t) dequantize_row_iq4_xs,
        .from_float_ref           = (ggml_from_float_t)quantize_row_iq4_xs_ref,
    },
    [GGML_TYPE_Q8_K] = {
        .type_name                = "q8_K",
        .blck_size                = QK_K,
        .type_size                = sizeof(block_q8_K),
        .is_quantized             = true,
    },
    [GGML_TYPE_BF16] = {
        .type_name                = "bf16",
        .blck_size                = 1,
        .type_size                = sizeof(ggml_bf16_t),
        .is_quantized             = false,
        .to_float                 = (ggml_to_float_t) ggml_bf16_to_fp32_row,
        .from_float_ref           = (ggml_from_float_t) ggml_fp32_to_bf16_row_ref,
    },
    [GGML_TYPE_Q4_0_4_4] = {
        .type_name                = "q4_0_4x4",
        .blck_size                = QK4_0,
        .blck_size_interleave     = 4,
        .type_size                = sizeof(block_q4_0),
        .is_quantized             = true,
        .to_float                 = NULL,
        .from_float_ref           = NULL,
    },
    [GGML_TYPE_Q4_0_4_8] = {
        .type_name                = "q4_0_4x8",
        .blck_size                = QK4_0,
        .blck_size_interleave     = 8,
        .type_size                = sizeof(block_q4_0),
        .is_quantized             = true,
        .to_float                 = NULL,
        .from_float_ref           = NULL,
    },
    [GGML_TYPE_Q4_0_8_8] = {
        .type_name                = "q4_0_8x8",
        .blck_size                = QK4_0,
        .blck_size_interleave     = 8,
        .type_size                = sizeof(block_q4_0),
        .is_quantized             = true,
        .to_float                 = NULL,
        .from_float_ref           = NULL,
xuxzh1's avatar
update  
xuxzh1 committed
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
    },
    [GGML_TYPE_TQ1_0] = {
        .type_name                = "tq1_0",
        .blck_size                = QK_K,
        .type_size                = sizeof(block_tq1_0),
        .is_quantized             = true,
        .to_float                 = (ggml_to_float_t) dequantize_row_tq1_0,
        .from_float_ref           = (ggml_from_float_t) quantize_row_tq1_0_ref,
    },
    [GGML_TYPE_TQ2_0] = {
        .type_name                = "tq2_0",
        .blck_size                = QK_K,
        .type_size                = sizeof(block_tq2_0),
        .is_quantized             = true,
        .to_float                 = (ggml_to_float_t) dequantize_row_tq2_0,
        .from_float_ref           = (ggml_from_float_t) quantize_row_tq2_0_ref,
    },
xuxzh1's avatar
init  
xuxzh1 committed
834
835
};

xuxzh1's avatar
update  
xuxzh1 committed
836
const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) {
xuxzh1's avatar
init  
xuxzh1 committed
837
    GGML_ASSERT(type < GGML_TYPE_COUNT);
xuxzh1's avatar
update  
xuxzh1 committed
838
    return &type_traits[type];
xuxzh1's avatar
init  
xuxzh1 committed
839
840
841
}

//
xuxzh1's avatar
update  
xuxzh1 committed
842
// ggml object
xuxzh1's avatar
init  
xuxzh1 committed
843
844
//

xuxzh1's avatar
update  
xuxzh1 committed
845
846
847
struct ggml_object {
    size_t offs;
    size_t size;
xuxzh1's avatar
init  
xuxzh1 committed
848

xuxzh1's avatar
update  
xuxzh1 committed
849
    struct ggml_object * next;
xuxzh1's avatar
init  
xuxzh1 committed
850

xuxzh1's avatar
update  
xuxzh1 committed
851
    enum ggml_object_type type;
xuxzh1's avatar
init  
xuxzh1 committed
852

xuxzh1's avatar
update  
xuxzh1 committed
853
854
    char padding[4];
};
xuxzh1's avatar
init  
xuxzh1 committed
855

xuxzh1's avatar
update  
xuxzh1 committed
856
// size in bytes of the per-object header that ggml_new_object writes in front of every payload
static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
xuxzh1's avatar
init  
xuxzh1 committed
857
858
859
860
861
862
863

//
// ggml context
//

// Bookkeeping for one memory pool and the list of objects carved out of it.
struct ggml_context {
    size_t mem_size;         // size of mem_buffer in bytes
    void * mem_buffer;       // pool that ggml_new_object allocates from
    bool   mem_buffer_owned; // true when ggml_init allocated mem_buffer itself; ggml_free then releases it
    bool   no_alloc;         // when true, new non-view tensors reserve no data storage in the pool

    int    n_objects;

    struct ggml_object * objects_begin; // first object in the pool, NULL while the pool is empty
    struct ggml_object * objects_end;   // last object in the pool, NULL while the pool is empty
};

// Pairs an embedded ggml_context with an in-use flag.
// NOTE(review): no user of this struct is visible in this part of the file - confirm it is still needed.
struct ggml_context_container {
    bool used;

    struct ggml_context context;
};

xuxzh1's avatar
update  
xuxzh1 committed
880
881
882
//
// data types
//
xuxzh1's avatar
init  
xuxzh1 committed
883

xuxzh1's avatar
update  
xuxzh1 committed
884
885
// Upper-case display name for every ggml_op, indexed by the enum value.
// The entry order must match enum ggml_op; the static_assert that follows the
// table pins the expected number of ops.
static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
    "NONE",

    "DUP",
    "ADD",
    "ADD1",
    "ACC",
    "SUB",
    "MUL",
    "DIV",
    "SQR",
    "SQRT",
    "LOG",
    "SIN",
    "COS",
    "SUM",
    "SUM_ROWS",
    "MEAN",
    "ARGMAX",
    "COUNT_EQUAL",
    "REPEAT",
    "REPEAT_BACK",
    "CONCAT",
    "SILU_BACK",
    "NORM",
    "RMS_NORM",
    "RMS_NORM_BACK",
    "GROUP_NORM",

    "MUL_MAT",
    "MUL_MAT_ID",
    "OUT_PROD",

    "SCALE",
    "SET",
    "CPY",
    "CONT",
    "RESHAPE",
    "VIEW",
    "PERMUTE",
    "TRANSPOSE",
    "GET_ROWS",
    "GET_ROWS_BACK",
    "DIAG",
    "DIAG_MASK_INF",
    "DIAG_MASK_ZERO",
    "SOFT_MAX",
    "SOFT_MAX_BACK",
    "ROPE",
    "ROPE_BACK",
    "CLAMP",
    "CONV_TRANSPOSE_1D",
    "IM2COL",
    "IM2COL_BACK",
    "CONV_TRANSPOSE_2D",
    "POOL_1D",
    "POOL_2D",
    "POOL_2D_BACK",
    "UPSCALE",
    "PAD",
    "ARANGE",
    "TIMESTEP_EMBEDDING",
    "ARGSORT",
    "LEAKY_RELU",

    "FLASH_ATTN_EXT",
    "FLASH_ATTN_BACK",
    "SSM_CONV",
    "SSM_SCAN",
    "WIN_PART",
    "WIN_UNPART",
    "GET_REL_POS",
    "ADD_REL_POS",
    "RWKV_WKV6",

    "UNARY",

    "MAP_UNARY",
    "MAP_BINARY",

    "MAP_CUSTOM1_F32",
    "MAP_CUSTOM2_F32",
    "MAP_CUSTOM3_F32",

    "MAP_CUSTOM1",
    "MAP_CUSTOM2",
    "MAP_CUSTOM3",

    "CROSS_ENTROPY_LOSS",
    "CROSS_ENTROPY_LOSS_BACK",
    "OPT_STEP_ADAMW",
};
xuxzh1's avatar
init  
xuxzh1 committed
976

xuxzh1's avatar
update  
xuxzh1 committed
977
static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
xuxzh1's avatar
init  
xuxzh1 committed
978

xuxzh1's avatar
update  
xuxzh1 committed
979
980
// Short formula-like description for every ggml_op, indexed by the enum value.
// The entry order must match enum ggml_op (and therefore GGML_OP_NAME).
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "none",

    "x",
    "x+y",
    "x+y",
    "view(x,nb,offset)+=y->x",
    "x-y",
    "x*y",
    "x/y",
    "x^2",
    "√x",
    "log(x)",
    "sin(x)",
    "cos(x)",
    "Σx",
    "Σx_k",
    "Σx/n",
    "argmax(x)",
    "count_equal(x)",
    "repeat(x)",
    "repeat_back(x)",
    "concat(x, y)",
    "silu_back(x)",
    "norm(x)",
    "rms_norm(x)",
    "rms_norm_back(x)",
    "group_norm(x)",

    "X*Y",
    "X[i]*Y",
    "X*Y",

    "x*v",
    "y-\\>view(x)",
    "x-\\>y",
    "cont(x)",
    "reshape(x)",
    "view(x)",
    "permute(x)",
    "transpose(x)",
    "get_rows(x)",
    "get_rows_back(x)",
    "diag(x)",
    "diag_mask_inf(x)",
    "diag_mask_zero(x)",
    "soft_max(x)",
    "soft_max_back(x)",
    "rope(x)",
    "rope_back(x)",
    "clamp(x)",
    "conv_transpose_1d(x)",
    "im2col(x)",
    "im2col_back(x)",
    "conv_transpose_2d(x)",
    "pool_1d(x)",
    "pool_2d(x)",
    "pool_2d_back(x)",
    "upscale(x)",
    "pad(x)",
    "arange(start, stop, step)",
    "timestep_embedding(timesteps, dim, max_period)",
    "argsort(x)",
    "leaky_relu(x)",

    "flash_attn_ext(x)",
    "flash_attn_back(x)",
    "ssm_conv(x)",
    "ssm_scan(x)",
    "win_part(x)",
    "win_unpart(x)",
    "get_rel_pos(x)",
    "add_rel_pos(x)",
    "rwkv_wkv6(k, v, r, tf, td, s)",

    "unary(x)",

    "f(x)",
    "f(x,y)",

    "custom_f32(x)",
    "custom_f32(x,y)",
    "custom_f32(x,y,z)",

    "custom(x)",
    "custom(x,y)",
    "custom(x,y,z)",

    "cross_entropy_loss(x,y)",
    "cross_entropy_loss_back(x,y)",
    "adamw(x)",
};
xuxzh1's avatar
init  
xuxzh1 committed
1071

xuxzh1's avatar
update  
xuxzh1 committed
1072
static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
xuxzh1's avatar
init  
xuxzh1 committed
1073

xuxzh1's avatar
update  
xuxzh1 committed
1074
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
xuxzh1's avatar
init  
xuxzh1 committed
1075
1076


xuxzh1's avatar
update  
xuxzh1 committed
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
// Upper-case display name for every ggml_unary_op, indexed by the enum value
// (returned by ggml_unary_op_name). The entry order must match the enum; the
// static_assert after the table pins the expected number of entries.
static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
    "ABS",
    "SGN",
    "NEG",
    "STEP",
    "TANH",
    "ELU",
    "RELU",
    "SIGMOID",
    "GELU",
    "GELU_QUICK",
    "SILU",
    "HARDSWISH",
    "HARDSIGMOID",
    "EXP",
};
xuxzh1's avatar
init  
xuxzh1 committed
1093

xuxzh1's avatar
update  
xuxzh1 committed
1094
static_assert(GGML_UNARY_OP_COUNT == 14, "GGML_UNARY_OP_COUNT != 14");
xuxzh1's avatar
init  
xuxzh1 committed
1095
1096


xuxzh1's avatar
update  
xuxzh1 committed
1097
1098
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
xuxzh1's avatar
init  
xuxzh1 committed
1099
1100


xuxzh1's avatar
update  
xuxzh1 committed
1101
////////////////////////////////////////////////////////////////////////////////
xuxzh1's avatar
init  
xuxzh1 committed
1102

xuxzh1's avatar
update  
xuxzh1 committed
1103
1104
1105
// Logs one INFO line describing `obj`: its type, payload offset, size and next pointer.
void ggml_print_object(const struct ggml_object * obj) {
    GGML_LOG_INFO(" - ggml_object: type = %d, offset = %zu, size = %zu, next = %p\n",
            obj->type, obj->offs, obj->size, (const void *) obj->next);
}

xuxzh1's avatar
update  
xuxzh1 committed
1108
1109
void ggml_print_objects(const struct ggml_context * ctx) {
    struct ggml_object * obj = ctx->objects_begin;
xuxzh1's avatar
init  
xuxzh1 committed
1110

xuxzh1's avatar
update  
xuxzh1 committed
1111
    GGML_LOG_INFO("%s: objects in context %p:\n", __func__, (const void *) ctx);
xuxzh1's avatar
init  
xuxzh1 committed
1112

xuxzh1's avatar
update  
xuxzh1 committed
1113
1114
1115
    while (obj != NULL) {
        ggml_print_object(obj);
        obj = obj->next;
xuxzh1's avatar
init  
xuxzh1 committed
1116
1117
    }

xuxzh1's avatar
update  
xuxzh1 committed
1118
1119
    GGML_LOG_INFO("%s: --- end ---\n", __func__);
}
xuxzh1's avatar
init  
xuxzh1 committed
1120

xuxzh1's avatar
update  
xuxzh1 committed
1121
1122
int64_t ggml_nelements(const struct ggml_tensor * tensor) {
    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
xuxzh1's avatar
init  
xuxzh1 committed
1123

xuxzh1's avatar
update  
xuxzh1 committed
1124
1125
    return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
}
xuxzh1's avatar
init  
xuxzh1 committed
1126

xuxzh1's avatar
update  
xuxzh1 committed
1127
1128
int64_t ggml_nrows(const struct ggml_tensor * tensor) {
    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
xuxzh1's avatar
init  
xuxzh1 committed
1129

xuxzh1's avatar
update  
xuxzh1 committed
1130
1131
    return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
}
xuxzh1's avatar
init  
xuxzh1 committed
1132

xuxzh1's avatar
update  
xuxzh1 committed
1133
1134
1135
1136
1137
1138
1139
// Returns the number of bytes spanned by `tensor`'s data, computed from its
// extents (ne) and byte strides (nb) so that strided layouts are accounted for.
// Returns 0 for a tensor that has any extent <= 0.
size_t ggml_nbytes(const struct ggml_tensor * tensor) {
    // an empty tensor occupies no storage; leaving early also keeps the
    // (ne[i] - 1)*nb[i] terms below from going negative and wrapping around
    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
        if (tensor->ne[i] <= 0) {
            return 0;
        }
    }

    size_t nbytes;
    const size_t blck_size = ggml_blck_size(tensor->type);
    if (blck_size == 1) {
        // one element per block: last element's offset plus one element
        nbytes = ggml_type_size(tensor->type);
        for (int i = 0; i < GGML_MAX_DIMS; ++i) {
            nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
        }
    }
    else {
        // block types: a full row of ne[0] elements takes ne[0]/blck_size blocks of nb[0] bytes
        nbytes = tensor->ne[0]*tensor->nb[0]/blck_size;
        for (int i = 1; i < GGML_MAX_DIMS; ++i) {
            nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
        }
    }

    return nbytes;
}

xuxzh1's avatar
update  
xuxzh1 committed
1152
1153
// ggml_nbytes(tensor) rounded up with GGML_PAD to a multiple of GGML_MEM_ALIGN.
size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
    return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
}

xuxzh1's avatar
update  
xuxzh1 committed
1156
1157
1158
int64_t ggml_blck_size(enum ggml_type type) {
    return type_traits[type].blck_size;
}
xuxzh1's avatar
init  
xuxzh1 committed
1159

xuxzh1's avatar
update  
xuxzh1 committed
1160
1161
1162
size_t ggml_type_size(enum ggml_type type) {
    return type_traits[type].type_size;
}
xuxzh1's avatar
init  
xuxzh1 committed
1163

xuxzh1's avatar
update  
xuxzh1 committed
1164
1165
1166
1167
// Byte size of a row of `ne` elements of `type`.
// `ne` is expected to be a multiple of the type's block size (checked by assert in debug builds).
size_t ggml_row_size(enum ggml_type type, int64_t ne) {
    const int64_t elems_per_block = ggml_blck_size(type);
    assert(ne % elems_per_block == 0);
    return ggml_type_size(type)*ne/elems_per_block;
}
xuxzh1's avatar
init  
xuxzh1 committed
1168

xuxzh1's avatar
update  
xuxzh1 committed
1169
1170
// Average number of bytes per element of `type`: type_size divided by blck_size, as a double.
double ggml_type_sizef(enum ggml_type type) {
    return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
}

xuxzh1's avatar
update  
xuxzh1 committed
1173
1174
1175
// Display name of `type`, or "NONE" when `type` is not below GGML_TYPE_COUNT.
const char * ggml_type_name(enum ggml_type type) {
    if (type < GGML_TYPE_COUNT) {
        return type_traits[type].type_name;
    }
    return "NONE";
}
xuxzh1's avatar
init  
xuxzh1 committed
1176

xuxzh1's avatar
update  
xuxzh1 committed
1177
1178
1179
bool ggml_is_quantized(enum ggml_type type) {
    return type_traits[type].is_quantized;
}
xuxzh1's avatar
init  
xuxzh1 committed
1180

xuxzh1's avatar
update  
xuxzh1 committed
1181
1182
1183
// Looks up the upper-case name of `op` in GGML_OP_NAME (no range check on `op`).
const char * ggml_op_name(enum ggml_op op) {
    const char * const name = GGML_OP_NAME[op];
    return name;
}
xuxzh1's avatar
init  
xuxzh1 committed
1184

xuxzh1's avatar
update  
xuxzh1 committed
1185
1186
1187
// Looks up the formula-like symbol of `op` in GGML_OP_SYMBOL (no range check on `op`).
const char * ggml_op_symbol(enum ggml_op op) {
    const char * const symbol = GGML_OP_SYMBOL[op];
    return symbol;
}
xuxzh1's avatar
init  
xuxzh1 committed
1188

xuxzh1's avatar
update  
xuxzh1 committed
1189
1190
1191
// Looks up the upper-case name of unary op `op` in GGML_UNARY_OP_NAME (no range check on `op`).
const char * ggml_unary_op_name(enum ggml_unary_op op) {
    const char * const name = GGML_UNARY_OP_NAME[op];
    return name;
}
xuxzh1's avatar
init  
xuxzh1 committed
1192

xuxzh1's avatar
update  
xuxzh1 committed
1193
1194
1195
1196
// Human-readable name of the operation that produced `t`.
// For GGML_OP_UNARY the specific unary op name is returned instead of "UNARY".
const char * ggml_op_desc(const struct ggml_tensor * t) {
    if (t->op == GGML_OP_UNARY) {
        enum ggml_unary_op uop = ggml_get_unary_op(t);
        return ggml_unary_op_name(uop);
    }
    return ggml_op_name(t->op);
}
xuxzh1's avatar
init  
xuxzh1 committed
1200

xuxzh1's avatar
update  
xuxzh1 committed
1201
1202
1203
size_t ggml_element_size(const struct ggml_tensor * tensor) {
    return ggml_type_size(tensor->type);
}
xuxzh1's avatar
init  
xuxzh1 committed
1204

xuxzh1's avatar
update  
xuxzh1 committed
1205
1206
bool ggml_is_scalar(const struct ggml_tensor * tensor) {
    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
xuxzh1's avatar
init  
xuxzh1 committed
1207

xuxzh1's avatar
update  
xuxzh1 committed
1208
1209
    return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
}
xuxzh1's avatar
init  
xuxzh1 committed
1210

xuxzh1's avatar
update  
xuxzh1 committed
1211
1212
bool ggml_is_vector(const struct ggml_tensor * tensor) {
    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
xuxzh1's avatar
init  
xuxzh1 committed
1213

xuxzh1's avatar
update  
xuxzh1 committed
1214
    return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
xuxzh1's avatar
init  
xuxzh1 committed
1215
1216
}

xuxzh1's avatar
update  
xuxzh1 committed
1217
1218
bool ggml_is_matrix(const struct ggml_tensor * tensor) {
    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
xuxzh1's avatar
init  
xuxzh1 committed
1219

xuxzh1's avatar
update  
xuxzh1 committed
1220
1221
    return tensor->ne[2] == 1 && tensor->ne[3] == 1;
}
xuxzh1's avatar
init  
xuxzh1 committed
1222

xuxzh1's avatar
update  
xuxzh1 committed
1223
1224
1225
// True when the last extent, ne[3], is 1 (the lower extents are not inspected).
bool ggml_is_3d(const struct ggml_tensor * tensor) {
    const bool last_extent_is_one = tensor->ne[3] == 1;
    return last_extent_is_one;
}
xuxzh1's avatar
init  
xuxzh1 committed
1226

xuxzh1's avatar
update  
xuxzh1 committed
1227
1228
1229
1230
// Effective dimensionality of `tensor`: one more than the highest dimension
// index whose extent is greater than 1, and never less than 1.
int ggml_n_dims(const struct ggml_tensor * tensor) {
    // scan from the outermost dimension down; dimension 0 always counts
    for (int i = GGML_MAX_DIMS - 1; i >= 1; --i) {
        if (tensor->ne[i] > 1) {
            return i + 1;
        }
    }
    return 1;
}

xuxzh1's avatar
update  
xuxzh1 committed
1236
1237
enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
    enum ggml_type wtype = GGML_TYPE_COUNT;
xuxzh1's avatar
init  
xuxzh1 committed
1238

xuxzh1's avatar
update  
xuxzh1 committed
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
    switch (ftype) {
        case GGML_FTYPE_ALL_F32:              wtype = GGML_TYPE_F32;   break;
        case GGML_FTYPE_MOSTLY_F16:           wtype = GGML_TYPE_F16;   break;
        case GGML_FTYPE_MOSTLY_BF16:          wtype = GGML_TYPE_BF16;  break;
        case GGML_FTYPE_MOSTLY_Q4_0:          wtype = GGML_TYPE_Q4_0;  break;
        case GGML_FTYPE_MOSTLY_Q4_1:          wtype = GGML_TYPE_Q4_1;  break;
        case GGML_FTYPE_MOSTLY_Q5_0:          wtype = GGML_TYPE_Q5_0;  break;
        case GGML_FTYPE_MOSTLY_Q5_1:          wtype = GGML_TYPE_Q5_1;  break;
        case GGML_FTYPE_MOSTLY_Q8_0:          wtype = GGML_TYPE_Q8_0;  break;
        case GGML_FTYPE_MOSTLY_Q2_K:          wtype = GGML_TYPE_Q2_K;  break;
        case GGML_FTYPE_MOSTLY_Q3_K:          wtype = GGML_TYPE_Q3_K;  break;
        case GGML_FTYPE_MOSTLY_Q4_K:          wtype = GGML_TYPE_Q4_K;  break;
        case GGML_FTYPE_MOSTLY_Q5_K:          wtype = GGML_TYPE_Q5_K;  break;
        case GGML_FTYPE_MOSTLY_Q6_K:          wtype = GGML_TYPE_Q6_K;  break;
        case GGML_FTYPE_MOSTLY_IQ2_XXS:       wtype = GGML_TYPE_IQ2_XXS;  break;
        case GGML_FTYPE_MOSTLY_IQ2_XS:        wtype = GGML_TYPE_IQ2_XS;   break;
        case GGML_FTYPE_MOSTLY_IQ3_XXS:       wtype = GGML_TYPE_IQ3_XXS;  break;
        case GGML_FTYPE_MOSTLY_IQ1_S:         wtype = GGML_TYPE_IQ1_S;    break;
        case GGML_FTYPE_MOSTLY_IQ1_M:         wtype = GGML_TYPE_IQ1_M;    break;
        case GGML_FTYPE_MOSTLY_IQ4_NL:        wtype = GGML_TYPE_IQ4_NL;   break;
        case GGML_FTYPE_MOSTLY_IQ4_XS:        wtype = GGML_TYPE_IQ4_XS;   break;
        case GGML_FTYPE_MOSTLY_IQ3_S:         wtype = GGML_TYPE_IQ3_S;    break;
        case GGML_FTYPE_MOSTLY_IQ2_S:         wtype = GGML_TYPE_IQ2_S;    break;
        case GGML_FTYPE_MOSTLY_Q4_0_4_4:      wtype = GGML_TYPE_Q4_0_4_4; break;
        case GGML_FTYPE_MOSTLY_Q4_0_4_8:      wtype = GGML_TYPE_Q4_0_4_8; break;
        case GGML_FTYPE_MOSTLY_Q4_0_8_8:      wtype = GGML_TYPE_Q4_0_8_8; break;
        case GGML_FTYPE_UNKNOWN:              wtype = GGML_TYPE_COUNT; break;
        case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
xuxzh1's avatar
init  
xuxzh1 committed
1267
    }
xuxzh1's avatar
update  
xuxzh1 committed
1268
1269
1270
1271

    GGML_ASSERT(wtype != GGML_TYPE_COUNT);

    return wtype;
xuxzh1's avatar
init  
xuxzh1 committed
1272
1273
}

xuxzh1's avatar
update  
xuxzh1 committed
1274
1275
// Pool bytes consumed per tensor excluding its data: the object header plus GGML_TENSOR_SIZE.
size_t ggml_tensor_overhead(void) {
    return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE;
}

xuxzh1's avatar
update  
xuxzh1 committed
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
// True when the byte stride of dimension 1 is smaller than that of dimension 0.
bool ggml_is_transposed(const struct ggml_tensor * tensor) {
    return tensor->nb[1] < tensor->nb[0];
}

// Checks whether `tensor` is densely packed in every dimension above `n`.
// Dimensions 1..n may carry an arbitrary stride; each dimension i > n with
// extent != 1 must have nb[i] equal to the accumulated size of the lower
// dimensions. Dimensions with extent 1 are skipped.
static bool ggml_is_contiguous_n(const struct ggml_tensor * tensor, int n) {
    size_t next_nb = ggml_type_size(tensor->type);
    // dimension 0 must use the type's own stride, unless the row is exactly one block long
    if (tensor->ne[0] != ggml_blck_size(tensor->type) && tensor->nb[0] != next_nb) {
        return false;
    }
    // bytes in one full row
    next_nb *= tensor->ne[0]/ggml_blck_size(tensor->type);
    for (int i = 1; i < GGML_MAX_DIMS; i++) {
        if (tensor->ne[i] != 1) {
            if (i > n) {
                if (tensor->nb[i] != next_nb) {
                    return false;
                }
                next_nb *= tensor->ne[i];
            } else {
                // this dimension does not need to be contiguous
                next_nb = tensor->ne[i]*tensor->nb[i];
            }
        }
    }
    return true;
}
xuxzh1's avatar
update  
xuxzh1 committed
1303
1304
1305

// Alias for ggml_is_contiguous_0().
bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
    return ggml_is_contiguous_0(tensor);
}

xuxzh1's avatar
update  
xuxzh1 committed
1308
1309
// True when every dimension of `tensor` is densely packed (ggml_is_contiguous_n with n = 0).
bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) {
    return ggml_is_contiguous_n(tensor, 0);
}

xuxzh1's avatar
update  
xuxzh1 committed
1312
1313
1314
// True when `tensor` is densely packed in every dimension above 1
// (ggml_is_contiguous_n with n = 1; dimension 1 may have any stride).
bool ggml_is_contiguous_1(const struct ggml_tensor * tensor) {
    const int highest_free_dim = 1;
    return ggml_is_contiguous_n(tensor, highest_free_dim);
}
xuxzh1's avatar
init  
xuxzh1 committed
1315

xuxzh1's avatar
update  
xuxzh1 committed
1316
1317
// True when `tensor` is densely packed in every dimension above 2
// (ggml_is_contiguous_n with n = 2; dimensions 1 and 2 may have any stride).
bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) {
    return ggml_is_contiguous_n(tensor, 2);
}
xuxzh1's avatar
update  
xuxzh1 committed
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340

// True when the byte strides are not in non-decreasing order, i.e. some
// dimension has a larger stride than the dimension right above it.
bool ggml_is_permuted(const struct ggml_tensor * tensor) {
    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

    for (int i = 0; i + 1 < GGML_MAX_DIMS; ++i) {
        if (tensor->nb[i] > tensor->nb[i + 1]) {
            return true;
        }
    }
    return false;
}

// True when dimension 0 uses the type's own stride and dimensions 2 and 3 are
// densely packed on top of dimension 1; nb[1] itself is unconstrained, so rows
// may be padded.
static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

    if (tensor->nb[0] != ggml_type_size(tensor->type)) {
        return false;
    }
    if (tensor->nb[2] != tensor->nb[1]*tensor->ne[1]) {
        return false;
    }
    return tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
}

// True when `tensor` holds no elements, i.e. at least one extent is 0.
bool ggml_is_empty(const struct ggml_tensor * tensor) {
    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
        if (tensor->ne[i] == 0) {
            // empty if any dimension has no elements
            return true;
        }
    }
    return false;
}

xuxzh1's avatar
update  
xuxzh1 committed
1345
1346
1347
1348
1349
1350
1351
1352
// True when `t0` and `t1` have identical extents in all four dimensions.
bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

    return
        (t0->ne[0] == t1->ne[0]) &&
        (t0->ne[1] == t1->ne[1]) &&
        (t0->ne[2] == t1->ne[2]) &&
        (t0->ne[3] == t1->ne[3]);
}

xuxzh1's avatar
update  
xuxzh1 committed
1355
1356
bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
xuxzh1's avatar
init  
xuxzh1 committed
1357

xuxzh1's avatar
update  
xuxzh1 committed
1358
1359
1360
1361
1362
1363
    return
        (t0->nb[0] == t1->nb[0]) &&
        (t0->nb[1] == t1->nb[1]) &&
        (t0->nb[2] == t1->nb[2]) &&
        (t0->nb[3] == t1->nb[3]);
}
xuxzh1's avatar
init  
xuxzh1 committed
1364

xuxzh1's avatar
update  
xuxzh1 committed
1365
1366
1367
1368
1369
1370
1371
1372
1373
// check if t1 can be represented as a repetition of t0
// True when every extent of `t1` is a whole multiple of the matching extent of `t0`.
// An empty `t0` can only be repeated into an empty `t1`.
bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

    // the emptiness test comes first so the modulo below never divides by a zero extent
    return ggml_is_empty(t0) ? ggml_is_empty(t1) :
        (t1->ne[0]%t0->ne[0] == 0) &&
        (t1->ne[1]%t0->ne[1] == 0) &&
        (t1->ne[2]%t0->ne[2] == 0) &&
        (t1->ne[3]%t0->ne[3] == 0);
}

xuxzh1's avatar
update  
xuxzh1 committed
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
// Like ggml_can_repeat(), but additionally requires both tensors to have the same ne[0].
static inline bool ggml_can_repeat_rows(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

    if (t0->ne[0] != t1->ne[0]) {
        return false;
    }
    return ggml_can_repeat(t0, t1);
}

// assert that pointer is aligned to GGML_MEM_ALIGN
#define GGML_ASSERT_ALIGNED(ptr) \
    GGML_ASSERT(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0)

////////////////////////////////////////////////////////////////////////////////

// Creates a new context with an empty object list.
//
// When params.mem_buffer is NULL the context allocates its own pool of
// params.mem_size bytes (rounded up to GGML_MEM_ALIGN) with ggml_aligned_malloc
// and marks it as owned, so ggml_free releases it. Otherwise the caller's
// buffer is used as-is and must be aligned to GGML_MEM_ALIGN (asserted).
// A mem_size of 0 is bumped to GGML_MEM_ALIGN.
//
// The first call also performs one-time global setup inside the ggml critical section.
struct ggml_context * ggml_init(struct ggml_init_params params) {
    static bool is_first_call = true;

    ggml_critical_section_start();

    if (is_first_call) {
        // initialize time system (required on Windows)
        ggml_time_init();

        // fill the conversion table with one entry per 16-bit pattern
        for (int i = 0; i < (1 << 16); ++i) {
            union {
                uint16_t u16;
                ggml_fp16_t fp16;
            } u = {i};
            ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16);
        }

        is_first_call = false;
    }

    ggml_critical_section_end();

    struct ggml_context * ctx = GGML_MALLOC(sizeof(struct ggml_context));

    // allow to call ggml_init with 0 size
    if (params.mem_size == 0) {
        params.mem_size = GGML_MEM_ALIGN;
    }

    // a caller-provided buffer keeps its exact size; only an internal allocation is padded
    const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);

    *ctx = (struct ggml_context) {
        /*.mem_size           =*/ mem_size,
        /*.mem_buffer         =*/ params.mem_buffer ? params.mem_buffer : ggml_aligned_malloc(mem_size),
        /*.mem_buffer_owned   =*/ params.mem_buffer ? false : true,
        /*.no_alloc           =*/ params.no_alloc,
        /*.n_objects          =*/ 0,
        /*.objects_begin      =*/ NULL,
        /*.objects_end        =*/ NULL,
    };

    GGML_ASSERT(ctx->mem_buffer != NULL);

    GGML_ASSERT_ALIGNED(ctx->mem_buffer);

    GGML_PRINT_DEBUG("%s: context initialized\n", __func__);

    return ctx;
}

xuxzh1's avatar
update  
xuxzh1 committed
1438
1439
1440
// Forgets every object in `ctx` so the pool can be reused from the start.
// The pool memory itself is neither freed nor cleared. A NULL `ctx` is ignored.
void ggml_reset(struct ggml_context * ctx) {
    if (ctx == NULL) {
        return;
    }

    ctx->n_objects     = 0;
    ctx->objects_begin = NULL;
    ctx->objects_end   = NULL;
}

xuxzh1's avatar
update  
xuxzh1 committed
1448
1449
1450
void ggml_free(struct ggml_context * ctx) {
    if (ctx == NULL) {
        return;
xuxzh1's avatar
init  
xuxzh1 committed
1451
1452
    }

xuxzh1's avatar
update  
xuxzh1 committed
1453
1454
    if (ctx->mem_buffer_owned) {
        ggml_aligned_free(ctx->mem_buffer, ctx->mem_size);
xuxzh1's avatar
init  
xuxzh1 committed
1455
    }
xuxzh1's avatar
update  
xuxzh1 committed
1456
1457

    GGML_FREE(ctx);
xuxzh1's avatar
init  
xuxzh1 committed
1458
1459
}

xuxzh1's avatar
update  
xuxzh1 committed
1460
1461
// Bytes of the pool in use: the end offset of the last object, or 0 when the context is empty.
size_t ggml_used_mem(const struct ggml_context * ctx) {
    return ctx->objects_end == NULL ? 0 : ctx->objects_end->offs + ctx->objects_end->size;
}

xuxzh1's avatar
update  
xuxzh1 committed
1464
1465
// Returns the context's no_alloc flag.
bool ggml_get_no_alloc(struct ggml_context * ctx) {
    return ctx->no_alloc;
}

xuxzh1's avatar
update  
xuxzh1 committed
1468
1469
// Sets the context's no_alloc flag; it affects tensors created afterwards.
void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
    ctx->no_alloc = no_alloc;
}

xuxzh1's avatar
update  
xuxzh1 committed
1472
1473
// Returns the start of the context's memory pool.
void * ggml_get_mem_buffer(const struct ggml_context * ctx) {
    return ctx->mem_buffer;
}

xuxzh1's avatar
update  
xuxzh1 committed
1476
1477
// Returns the total size in bytes of the context's memory pool.
size_t ggml_get_mem_size(const struct ggml_context * ctx) {
    return ctx->mem_size;
}

xuxzh1's avatar
update  
xuxzh1 committed
1480
1481
size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
    size_t max_size = 0;
xuxzh1's avatar
init  
xuxzh1 committed
1482

xuxzh1's avatar
update  
xuxzh1 committed
1483
1484
1485
1486
    for (struct ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor != NULL; tensor = ggml_get_next_tensor(ctx, tensor)) {
        size_t bytes = ggml_nbytes(tensor);
        max_size = MAX(max_size, bytes);
    }
xuxzh1's avatar
init  
xuxzh1 committed
1487

xuxzh1's avatar
update  
xuxzh1 committed
1488
1489
    return max_size;
}
xuxzh1's avatar
init  
xuxzh1 committed
1490

xuxzh1's avatar
update  
xuxzh1 committed
1491
////////////////////////////////////////////////////////////////////////////////
xuxzh1's avatar
init  
xuxzh1 committed
1492

xuxzh1's avatar
update  
xuxzh1 committed
1493
1494
1495
static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml_object_type type, size_t size) {
    // always insert objects at the end of the context's memory pool
    struct ggml_object * obj_cur = ctx->objects_end;
xuxzh1's avatar
init  
xuxzh1 committed
1496

xuxzh1's avatar
update  
xuxzh1 committed
1497
1498
1499
    const size_t cur_offs = obj_cur == NULL ? 0 : obj_cur->offs;
    const size_t cur_size = obj_cur == NULL ? 0 : obj_cur->size;
    const size_t cur_end  = cur_offs + cur_size;
xuxzh1's avatar
init  
xuxzh1 committed
1500

xuxzh1's avatar
update  
xuxzh1 committed
1501
1502
    // align to GGML_MEM_ALIGN
    size_t size_needed = GGML_PAD(size, GGML_MEM_ALIGN);
xuxzh1's avatar
init  
xuxzh1 committed
1503

xuxzh1's avatar
update  
xuxzh1 committed
1504
1505
    char * const mem_buffer = ctx->mem_buffer;
    struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
xuxzh1's avatar
init  
xuxzh1 committed
1506

xuxzh1's avatar
update  
xuxzh1 committed
1507
1508
1509
1510
1511
1512
1513
1514
    if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
        GGML_LOG_WARN("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
                __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
#ifndef NDEBUG
        GGML_ABORT("not enough space in the context's memory pool");
#endif
        return NULL;
    }
xuxzh1's avatar
init  
xuxzh1 committed
1515

xuxzh1's avatar
update  
xuxzh1 committed
1516
1517
1518
1519
1520
1521
    *obj_new = (struct ggml_object) {
        .offs = cur_end + GGML_OBJECT_SIZE,
        .size = size_needed,
        .next = NULL,
        .type = type,
    };
xuxzh1's avatar
init  
xuxzh1 committed
1522

xuxzh1's avatar
update  
xuxzh1 committed
1523
    GGML_ASSERT_ALIGNED(mem_buffer + obj_new->offs);
xuxzh1's avatar
init  
xuxzh1 committed
1524

xuxzh1's avatar
update  
xuxzh1 committed
1525
1526
1527
1528
1529
1530
    if (obj_cur != NULL) {
        obj_cur->next = obj_new;
    } else {
        // this is the first object in this context
        ctx->objects_begin = obj_new;
    }
xuxzh1's avatar
init  
xuxzh1 committed
1531

xuxzh1's avatar
update  
xuxzh1 committed
1532
    ctx->objects_end = obj_new;
xuxzh1's avatar
init  
xuxzh1 committed
1533

xuxzh1's avatar
update  
xuxzh1 committed
1534
    //printf("%s: inserted new object at %zu, size = %zu\n", __func__, cur_end, obj_new->size);
xuxzh1's avatar
init  
xuxzh1 committed
1535

xuxzh1's avatar
update  
xuxzh1 committed
1536
1537
    return obj_new;
}
xuxzh1's avatar
init  
xuxzh1 committed
1538

xuxzh1's avatar
update  
xuxzh1 committed
1539
1540
1541
1542
1543
1544
1545
static struct ggml_tensor * ggml_new_tensor_impl(
        struct ggml_context * ctx,
        enum   ggml_type      type,
        int                   n_dims,
        const int64_t       * ne,
        struct ggml_tensor  * view_src,
        size_t                view_offs) {
xuxzh1's avatar
init  
xuxzh1 committed
1546

xuxzh1's avatar
update  
xuxzh1 committed
1547
1548
    GGML_ASSERT(type >= 0 && type < GGML_TYPE_COUNT);
    GGML_ASSERT(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
xuxzh1's avatar
init  
xuxzh1 committed
1549

xuxzh1's avatar
update  
xuxzh1 committed
1550
1551
1552
1553
1554
    // find the base tensor and absolute offset
    if (view_src != NULL && view_src->view_src != NULL) {
        view_offs += view_src->view_offs;
        view_src   = view_src->view_src;
    }
xuxzh1's avatar
init  
xuxzh1 committed
1555

xuxzh1's avatar
update  
xuxzh1 committed
1556
1557
1558
1559
    size_t data_size = ggml_row_size(type, ne[0]);
    for (int i = 1; i < n_dims; i++) {
        data_size *= ne[i];
    }
xuxzh1's avatar
init  
xuxzh1 committed
1560

xuxzh1's avatar
update  
xuxzh1 committed
1561
    GGML_ASSERT(view_src == NULL || data_size == 0 || data_size + view_offs <= ggml_nbytes(view_src));
xuxzh1's avatar
init  
xuxzh1 committed
1562

xuxzh1's avatar
update  
xuxzh1 committed
1563
1564
1565
1566
    void * data = view_src != NULL ? view_src->data : NULL;
    if (data != NULL) {
        data = (char *) data + view_offs;
    }
xuxzh1's avatar
init  
xuxzh1 committed
1567

xuxzh1's avatar
update  
xuxzh1 committed
1568
    size_t obj_alloc_size = 0;
xuxzh1's avatar
init  
xuxzh1 committed
1569

xuxzh1's avatar
update  
xuxzh1 committed
1570
1571
1572
1573
    if (view_src == NULL && !ctx->no_alloc) {
        // allocate tensor data in the context's memory pool
        obj_alloc_size = data_size;
    }
xuxzh1's avatar
init  
xuxzh1 committed
1574

xuxzh1's avatar
update  
xuxzh1 committed
1575
1576
    struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TYPE_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size);
    GGML_ASSERT(obj_new);
xuxzh1's avatar
init  
xuxzh1 committed
1577

xuxzh1's avatar
update  
xuxzh1 committed
1578
    struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs);
xuxzh1's avatar
init  
xuxzh1 committed
1579

xuxzh1's avatar
update  
xuxzh1 committed
1580
1581
1582
1583
#ifdef __clang__
    // temporary until ggml_tensor::backend is removed
    #pragma clang diagnostic push
    #pragma clang diagnostic ignored "-Wdeprecated-declarations"
xuxzh1's avatar
init  
xuxzh1 committed
1584
1585
#endif

xuxzh1's avatar
update  
xuxzh1 committed
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
    *result = (struct ggml_tensor) {
        /*.type         =*/ type,
        /*.backend      =*/ GGML_BACKEND_TYPE_CPU,
        /*.buffer       =*/ NULL,
        /*.ne           =*/ { 1, 1, 1, 1 },
        /*.nb           =*/ { 0, 0, 0, 0 },
        /*.op           =*/ GGML_OP_NONE,
        /*.op_params    =*/ { 0 },
        /*.flags        =*/ 0,
        /*.src          =*/ { NULL },
        /*.view_src     =*/ view_src,
        /*.view_offs    =*/ view_offs,
        /*.data         =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data,
        /*.name         =*/ { 0 },
        /*.extra        =*/ NULL,
        /*.padding      =*/ { 0 },
    };
xuxzh1's avatar
init  
xuxzh1 committed
1603

xuxzh1's avatar
update  
xuxzh1 committed
1604
1605
1606
#ifdef __clang__
    #pragma clang diagnostic pop
#endif
xuxzh1's avatar
init  
xuxzh1 committed
1607

xuxzh1's avatar
update  
xuxzh1 committed
1608
1609
    // TODO: this should not be needed as long as we don't rely on aligned SIMD loads
    //GGML_ASSERT_ALIGNED(result->data);
xuxzh1's avatar
init  
xuxzh1 committed
1610

xuxzh1's avatar
update  
xuxzh1 committed
1611
1612
    for (int i = 0; i < n_dims; i++) {
        result->ne[i] = ne[i];
xuxzh1's avatar
init  
xuxzh1 committed
1613
1614
    }

xuxzh1's avatar
update  
xuxzh1 committed
1615
1616
1617
1618
    result->nb[0] = ggml_type_size(type);
    result->nb[1] = result->nb[0]*(result->ne[0]/ggml_blck_size(type));
    for (int i = 2; i < GGML_MAX_DIMS; i++) {
        result->nb[i] = result->nb[i - 1]*result->ne[i - 1];
xuxzh1's avatar
init  
xuxzh1 committed
1619
1620
    }

xuxzh1's avatar
update  
xuxzh1 committed
1621
    ctx->n_objects++;
xuxzh1's avatar
init  
xuxzh1 committed
1622

xuxzh1's avatar
update  
xuxzh1 committed
1623
1624
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
1625

xuxzh1's avatar
update  
xuxzh1 committed
1626
1627
1628
1629
1630
1631
// Creates a standalone (non-view) tensor of the given type with n_dims extents taken from ne.
// Forwards to ggml_new_tensor_impl with no view source and a zero view offset.
struct ggml_tensor * ggml_new_tensor(
        struct ggml_context * ctx,
        enum   ggml_type      type,
        int                   n_dims,
        const int64_t       * ne) {
    struct ggml_tensor * const no_view_src = NULL;

    return ggml_new_tensor_impl(ctx, type, n_dims, ne, no_view_src, 0);
}

xuxzh1's avatar
update  
xuxzh1 committed
1634
1635
1636
1637
1638
// Creates a 1-dimensional tensor of the given type with ne0 elements along dimension 0.
struct ggml_tensor * ggml_new_tensor_1d(
        struct ggml_context * ctx,
        enum   ggml_type      type,
        int64_t ne0) {
    const int64_t ne[1] = { ne0 };

    return ggml_new_tensor(ctx, type, 1, ne);
}

xuxzh1's avatar
update  
xuxzh1 committed
1641
1642
1643
1644
1645
1646
1647
// Creates a 2-dimensional tensor of the given type with extents (ne0, ne1).
struct ggml_tensor * ggml_new_tensor_2d(
        struct ggml_context * ctx,
        enum   ggml_type      type,
        int64_t ne0,
        int64_t ne1) {
    int64_t shape[2];
    shape[0] = ne0;
    shape[1] = ne1;

    return ggml_new_tensor(ctx, type, 2, shape);
}
xuxzh1's avatar
update  
xuxzh1 committed
1649
1650
1651
1652
1653
1654
1655
1656
1657

// Creates a 3-dimensional tensor of the given type with extents (ne0, ne1, ne2).
struct ggml_tensor * ggml_new_tensor_3d(
        struct ggml_context * ctx,
        enum   ggml_type      type,
        int64_t ne0,
        int64_t ne1,
        int64_t ne2) {
    int64_t shape[3];
    shape[0] = ne0;
    shape[1] = ne1;
    shape[2] = ne2;

    return ggml_new_tensor(ctx, type, 3, shape);
}

xuxzh1's avatar
update  
xuxzh1 committed
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
// Creates a 4-dimensional tensor of the given type with extents (ne0, ne1, ne2, ne3).
struct ggml_tensor * ggml_new_tensor_4d(
        struct ggml_context * ctx,
        enum   ggml_type type,
        int64_t ne0,
        int64_t ne1,
        int64_t ne2,
        int64_t ne3) {
    int64_t shape[4];
    shape[0] = ne0;
    shape[1] = ne1;
    shape[2] = ne2;
    shape[3] = ne3;

    return ggml_new_tensor(ctx, type, 4, shape);
}
xuxzh1's avatar
init  
xuxzh1 committed
1670

xuxzh1's avatar
update  
xuxzh1 committed
1671
1672
void * ggml_new_buffer(struct ggml_context * ctx, size_t nbytes) {
    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, nbytes);
xuxzh1's avatar
init  
xuxzh1 committed
1673

xuxzh1's avatar
update  
xuxzh1 committed
1674
1675
    return (uint8_t *)ctx->mem_buffer + obj->offs;
}
xuxzh1's avatar
init  
xuxzh1 committed
1676

xuxzh1's avatar
update  
xuxzh1 committed
1677
1678
1679
// Creates a new tensor with the same type and the same GGML_MAX_DIMS extents as src.
// Only type and shape are taken from src; nothing else is read from it here.
struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
    const int64_t * const shape = src->ne;

    return ggml_new_tensor(ctx, src->type, GGML_MAX_DIMS, shape);
}
xuxzh1's avatar
init  
xuxzh1 committed
1680

xuxzh1's avatar
update  
xuxzh1 committed
1681
1682
1683
1684
// Splits the flat element index i into per-dimension indices using the tensor's
// extents ne[0..2]. Each of i0..i3 may be NULL, in which case that component is
// not written.
void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3) {
    const int64_t ne0 = tensor->ne[0];
    const int64_t ne1 = tensor->ne[1];
    const int64_t ne2 = tensor->ne[2];

    const int64_t plane  = ne1*ne0;     // elements per step along dimension 2
    const int64_t volume = ne2*ne1*ne0; // elements per step along dimension 3

    // peel off the slowest-varying index first, then work on what remains
    const int64_t idx3 = i/volume;
    const int64_t rem3 = i - idx3*volume;

    const int64_t idx2 = rem3/plane;
    const int64_t rem2 = rem3 - idx2*plane;

    const int64_t idx1 = rem2/ne0;
    const int64_t idx0 = rem2 - idx1*ne0;

    if (i0 != NULL) {
        *i0 = idx0;
    }
    if (i1 != NULL) {
        *i1 = idx1;
    }
    if (i2 != NULL) {
        *i2 = idx2;
    }
    if (i3 != NULL) {
        *i3 = idx3;
    }
}
xuxzh1's avatar
init  
xuxzh1 committed
1704

xuxzh1's avatar
update  
xuxzh1 committed
1705
1706
1707
// Returns the tensor's data pointer as stored in the tensor (no checks are made).
void * ggml_get_data(const struct ggml_tensor * tensor) {
    void * const data = tensor->data;

    return data;
}
xuxzh1's avatar
init  
xuxzh1 committed
1708

xuxzh1's avatar
update  
xuxzh1 committed
1709
1710
1711
1712
// Returns the tensor's data pointer typed as float *.
// The F32 type requirement is checked with assert(), so it is only enforced
// when assertions are compiled in.
float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
    assert(tensor->type == GGML_TYPE_F32);

    float * const values = (float *)(tensor->data);

    return values;
}
xuxzh1's avatar
init  
xuxzh1 committed
1713

xuxzh1's avatar
update  
xuxzh1 committed
1714
1715
1716
1717
// Returns the unary operation stored in the tensor's op parameters.
// The tensor's op must be GGML_OP_UNARY; otherwise GGML_ASSERT aborts.
enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
    GGML_ASSERT(tensor->op == GGML_OP_UNARY);
    // the unary op is read from op-parameter slot 0 and cast back to the enum
    return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
}
xuxzh1's avatar
init  
xuxzh1 committed
1718

xuxzh1's avatar
update  
xuxzh1 committed
1719
1720
1721
// Returns a pointer to the tensor's embedded name buffer (not a copy).
const char * ggml_get_name(const struct ggml_tensor * tensor) {
    const char * const label = tensor->name;

    return label;
}
xuxzh1's avatar
init  
xuxzh1 committed
1722

xuxzh1's avatar
update  
xuxzh1 committed
1723
1724
1725
1726
// Copies name into the tensor's fixed-size name buffer, truncating it if needed
// so that the result is always NUL-terminated. Returns the same tensor.
struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name) {
    // keep one byte in reserve for the terminator
    const size_t max_chars = sizeof(tensor->name) - 1;

    size_t n = 0;
    while (n < max_chars && name[n] != '\0') {
        tensor->name[n] = name[n];
        n++;
    }
    tensor->name[n] = '\0';

    return tensor;
}

xuxzh1's avatar
update  
xuxzh1 committed
1732
1733
1734
1735
1736
1737
// Sets the tensor's name from a printf-style format string and arguments.
// Output is bounded by the size of the name buffer via vsnprintf.
// Returns the same tensor.
struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) {
    va_list ap;

    va_start(ap, fmt);
    vsnprintf(tensor->name, sizeof(tensor->name), fmt, ap);
    va_end(ap);

    return tensor;
}

xuxzh1's avatar
update  
xuxzh1 committed
1740
1741
1742
1743
1744
// Creates a view of src: a tensor with the same type and extents that uses src
// as its view source at offset 0. The view is named "<src name> (view)" and
// takes over src's byte strides.
struct ggml_tensor * ggml_view_tensor(
        struct ggml_context * ctx,
        struct ggml_tensor  * src) {
    struct ggml_tensor * view = ggml_new_tensor_impl(ctx, src->type, GGML_MAX_DIMS, src->ne, src, 0);

    ggml_format_name(view, "%s (view)", src->name);

    // replace the freshly computed strides with the ones src actually uses
    for (int d = 0; d < GGML_MAX_DIMS; d++) {
        view->nb[d] = src->nb[d];
    }

    return view;
}

xuxzh1's avatar
update  
xuxzh1 committed
1753
struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx) {
xuxzh1's avatar
init  
xuxzh1 committed
1754
1755
    struct ggml_object * obj = ctx->objects_begin;

xuxzh1's avatar
update  
xuxzh1 committed
1756
    char * const mem_buffer = ctx->mem_buffer;
xuxzh1's avatar
init  
xuxzh1 committed
1757
1758

    while (obj != NULL) {
xuxzh1's avatar
update  
xuxzh1 committed
1759
1760
1761
1762
        if (obj->type == GGML_OBJECT_TYPE_TENSOR) {
            return (struct ggml_tensor *)(mem_buffer + obj->offs);
        }

xuxzh1's avatar
init  
xuxzh1 committed
1763
1764
1765
        obj = obj->next;
    }

xuxzh1's avatar
update  
xuxzh1 committed
1766
    return NULL;
xuxzh1's avatar
init  
xuxzh1 committed
1767
1768
}

xuxzh1's avatar
update  
xuxzh1 committed
1769
1770
1771
struct ggml_tensor * ggml_get_next_tensor(const struct ggml_context * ctx, struct ggml_tensor * tensor) {
    struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
    obj = obj->next;
xuxzh1's avatar
init  
xuxzh1 committed
1772

xuxzh1's avatar
update  
xuxzh1 committed
1773
    char * const mem_buffer = ctx->mem_buffer;
xuxzh1's avatar
init  
xuxzh1 committed
1774

xuxzh1's avatar
update  
xuxzh1 committed
1775
1776
1777
    while (obj != NULL) {
        if (obj->type == GGML_OBJECT_TYPE_TENSOR) {
            return (struct ggml_tensor *)(mem_buffer + obj->offs);
xuxzh1's avatar
init  
xuxzh1 committed
1778
        }
xuxzh1's avatar
update  
xuxzh1 committed
1779
1780

        obj = obj->next;
xuxzh1's avatar
init  
xuxzh1 committed
1781
1782
    }

xuxzh1's avatar
update  
xuxzh1 committed
1783
    return NULL;
xuxzh1's avatar
init  
xuxzh1 committed
1784
1785
}

xuxzh1's avatar
update  
xuxzh1 committed
1786
1787
struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
    struct ggml_object * obj = ctx->objects_begin;
xuxzh1's avatar
init  
xuxzh1 committed
1788

xuxzh1's avatar
update  
xuxzh1 committed
1789
    char * const mem_buffer = ctx->mem_buffer;
xuxzh1's avatar
init  
xuxzh1 committed
1790

xuxzh1's avatar
update  
xuxzh1 committed
1791
1792
1793
1794
1795
1796
1797
    while (obj != NULL) {
        if (obj->type == GGML_OBJECT_TYPE_TENSOR) {
            struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
            if (strcmp(cur->name, name) == 0) {
                return cur;
            }
        }
xuxzh1's avatar
init  
xuxzh1 committed
1798

xuxzh1's avatar
update  
xuxzh1 committed
1799
1800
        obj = obj->next;
    }
xuxzh1's avatar
init  
xuxzh1 committed
1801

xuxzh1's avatar
update  
xuxzh1 committed
1802
    return NULL;
xuxzh1's avatar
init  
xuxzh1 committed
1803
1804
}

xuxzh1's avatar
update  
xuxzh1 committed
1805
////////////////////////////////////////////////////////////////////////////////
xuxzh1's avatar
init  
xuxzh1 committed
1806

xuxzh1's avatar
update  
xuxzh1 committed
1807
// ggml_dup
xuxzh1's avatar
init  
xuxzh1 committed
1808

xuxzh1's avatar
update  
xuxzh1 committed
1809
1810
1811
1812
1813
static struct ggml_tensor * ggml_dup_impl(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        bool                  inplace) {
    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
xuxzh1's avatar
init  
xuxzh1 committed
1814

xuxzh1's avatar
update  
xuxzh1 committed
1815
1816
    result->op     = GGML_OP_DUP;
    result->src[0] = a;
xuxzh1's avatar
init  
xuxzh1 committed
1817

xuxzh1's avatar
update  
xuxzh1 committed
1818
    return result;
xuxzh1's avatar
init  
xuxzh1 committed
1819
1820
}

xuxzh1's avatar
update  
xuxzh1 committed
1821
1822
1823
1824
// Builds a GGML_OP_DUP node whose result is a new tensor (not a view of a).
struct ggml_tensor * ggml_dup(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    const bool inplace = false;

    return ggml_dup_impl(ctx, a, inplace);
}

xuxzh1's avatar
update  
xuxzh1 committed
1827
1828
1829
1830
// Builds a GGML_OP_DUP node whose result is a view of a.
struct ggml_tensor * ggml_dup_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    const bool inplace = true;

    return ggml_dup_impl(ctx, a, inplace);
}

xuxzh1's avatar
update  
xuxzh1 committed
1833
// ggml_add
xuxzh1's avatar
init  
xuxzh1 committed
1834

xuxzh1's avatar
update  
xuxzh1 committed
1835
1836
1837
1838
1839
1840
static struct ggml_tensor * ggml_add_impl(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b,
        bool                  inplace) {
    GGML_ASSERT(ggml_can_repeat(b, a));
xuxzh1's avatar
init  
xuxzh1 committed
1841

xuxzh1's avatar
update  
xuxzh1 committed
1842
    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
xuxzh1's avatar
init  
xuxzh1 committed
1843

xuxzh1's avatar
update  
xuxzh1 committed
1844
1845
1846
    result->op     = GGML_OP_ADD;
    result->src[0] = a;
    result->src[1] = b;
xuxzh1's avatar
init  
xuxzh1 committed
1847

xuxzh1's avatar
update  
xuxzh1 committed
1848
1849
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
1850

xuxzh1's avatar
update  
xuxzh1 committed
1851
1852
1853
1854
1855
// Builds a GGML_OP_ADD node for a and b whose result is a new tensor.
struct ggml_tensor * ggml_add(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b) {
    const bool inplace = false;

    return ggml_add_impl(ctx, a, b, inplace);
}

xuxzh1's avatar
update  
xuxzh1 committed
1858
1859
1860
1861
1862
// Builds a GGML_OP_ADD node for a and b whose result is a view of a.
struct ggml_tensor * ggml_add_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b) {
    const bool inplace = true;

    return ggml_add_impl(ctx, a, b, inplace);
}

xuxzh1's avatar
update  
xuxzh1 committed
1865
// ggml_add_cast
xuxzh1's avatar
init  
xuxzh1 committed
1866

xuxzh1's avatar
update  
xuxzh1 committed
1867
1868
1869
1870
1871
1872
1873
1874
static struct ggml_tensor * ggml_add_cast_impl(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b,
        enum   ggml_type      type) {
    // TODO: support less-strict constraint
    //       GGML_ASSERT(ggml_can_repeat(b, a));
    GGML_ASSERT(ggml_can_repeat_rows(b, a));
xuxzh1's avatar
init  
xuxzh1 committed
1875

xuxzh1's avatar
update  
xuxzh1 committed
1876
1877
1878
1879
    // currently only supported for quantized input and f16
    GGML_ASSERT(ggml_is_quantized(a->type) ||
                a->type == GGML_TYPE_F16 ||
                a->type == GGML_TYPE_BF16);
xuxzh1's avatar
init  
xuxzh1 committed
1880

xuxzh1's avatar
update  
xuxzh1 committed
1881
    struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne);
xuxzh1's avatar
init  
xuxzh1 committed
1882

xuxzh1's avatar
update  
xuxzh1 committed
1883
1884
1885
    result->op     = GGML_OP_ADD;
    result->src[0] = a;
    result->src[1] = b;
xuxzh1's avatar
init  
xuxzh1 committed
1886

xuxzh1's avatar
update  
xuxzh1 committed
1887
1888
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
1889

xuxzh1's avatar
update  
xuxzh1 committed
1890
1891
1892
1893
1894
1895
1896
// Public wrapper around ggml_add_cast_impl: builds an add node whose result
// tensor has the requested type.
struct ggml_tensor * ggml_add_cast(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b,
        enum   ggml_type      type) {
    struct ggml_tensor * const node = ggml_add_cast_impl(ctx, a, b, type);

    return node;
}
xuxzh1's avatar
init  
xuxzh1 committed
1897

xuxzh1's avatar
update  
xuxzh1 committed
1898
// ggml_add1
xuxzh1's avatar
init  
xuxzh1 committed
1899

xuxzh1's avatar
update  
xuxzh1 committed
1900
1901
1902
1903
1904
1905
1906
static struct ggml_tensor * ggml_add1_impl(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b,
        bool                  inplace) {
    GGML_ASSERT(ggml_is_scalar(b));
    GGML_ASSERT(ggml_is_padded_1d(a));
xuxzh1's avatar
init  
xuxzh1 committed
1907

xuxzh1's avatar
update  
xuxzh1 committed
1908
    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
xuxzh1's avatar
init  
xuxzh1 committed
1909

xuxzh1's avatar
update  
xuxzh1 committed
1910
1911
1912
    result->op     = GGML_OP_ADD1;
    result->src[0] = a;
    result->src[1] = b;
xuxzh1's avatar
init  
xuxzh1 committed
1913

xuxzh1's avatar
update  
xuxzh1 committed
1914
    return result;
xuxzh1's avatar
init  
xuxzh1 committed
1915
1916
}

xuxzh1's avatar
update  
xuxzh1 committed
1917
1918
1919
1920
1921
// Builds a GGML_OP_ADD1 node for a and scalar b whose result is a new tensor.
struct ggml_tensor * ggml_add1(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b) {
    const bool inplace = false;

    return ggml_add1_impl(ctx, a, b, inplace);
}

xuxzh1's avatar
update  
xuxzh1 committed
1924
1925
1926
1927
1928
// Builds a GGML_OP_ADD1 node for a and scalar b whose result is a view of a.
struct ggml_tensor * ggml_add1_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b) {
    const bool inplace = true;

    return ggml_add1_impl(ctx, a, b, inplace);
}

xuxzh1's avatar
update  
xuxzh1 committed
1931
// ggml_acc
xuxzh1's avatar
init  
xuxzh1 committed
1932

xuxzh1's avatar
update  
xuxzh1 committed
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
static struct ggml_tensor * ggml_acc_impl(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b,
        size_t                nb1,
        size_t                nb2,
        size_t                nb3,
        size_t                offset,
        bool                  inplace) {
    GGML_ASSERT(ggml_nelements(b) <= ggml_nelements(a));
    GGML_ASSERT(ggml_is_contiguous(a));
    GGML_ASSERT(a->type == GGML_TYPE_F32);
    GGML_ASSERT(b->type == GGML_TYPE_F32);
xuxzh1's avatar
init  
xuxzh1 committed
1946

xuxzh1's avatar
update  
xuxzh1 committed
1947
    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
xuxzh1's avatar
init  
xuxzh1 committed
1948

xuxzh1's avatar
update  
xuxzh1 committed
1949
1950
    int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
    ggml_set_op_params(result, params, sizeof(params));
xuxzh1's avatar
init  
xuxzh1 committed
1951

xuxzh1's avatar
update  
xuxzh1 committed
1952
1953
1954
    result->op     = GGML_OP_ACC;
    result->src[0] = a;
    result->src[1] = b;
xuxzh1's avatar
init  
xuxzh1 committed
1955

xuxzh1's avatar
update  
xuxzh1 committed
1956
    return result;
xuxzh1's avatar
init  
xuxzh1 committed
1957
1958
}

xuxzh1's avatar
update  
xuxzh1 committed
1959
1960
1961
1962
1963
1964
1965
1966
1967
// Builds a GGML_OP_ACC node for a and b whose result is a new tensor.
// nb1, nb2, nb3 and offset are passed through to ggml_acc_impl unchanged.
struct ggml_tensor * ggml_acc(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b,
        size_t                nb1,
        size_t                nb2,
        size_t                nb3,
        size_t                offset) {
    const bool inplace = false;

    return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, inplace);
}

xuxzh1's avatar
update  
xuxzh1 committed
1970
1971
1972
1973
1974
1975
1976
1977
1978
// Builds a GGML_OP_ACC node for a and b whose result is a view of a.
// nb1, nb2, nb3 and offset are passed through to ggml_acc_impl unchanged.
struct ggml_tensor * ggml_acc_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b,
        size_t                nb1,
        size_t                nb2,
        size_t                nb3,
        size_t                offset) {
    const bool inplace = true;

    return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, inplace);
}

xuxzh1's avatar
update  
xuxzh1 committed
1981
// ggml_sub
xuxzh1's avatar
init  
xuxzh1 committed
1982

xuxzh1's avatar
update  
xuxzh1 committed
1983
1984
1985
1986
1987
1988
static struct ggml_tensor * ggml_sub_impl(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b,
        bool                  inplace) {
    GGML_ASSERT(ggml_can_repeat(b, a));
xuxzh1's avatar
init  
xuxzh1 committed
1989

xuxzh1's avatar
update  
xuxzh1 committed
1990
    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
xuxzh1's avatar
init  
xuxzh1 committed
1991

xuxzh1's avatar
update  
xuxzh1 committed
1992
1993
1994
    result->op     = GGML_OP_SUB;
    result->src[0] = a;
    result->src[1] = b;
xuxzh1's avatar
init  
xuxzh1 committed
1995

xuxzh1's avatar
update  
xuxzh1 committed
1996
    return result;
xuxzh1's avatar
init  
xuxzh1 committed
1997
1998
}

xuxzh1's avatar
update  
xuxzh1 committed
1999
2000
2001
2002
2003
// Builds a GGML_OP_SUB node for a and b whose result is a new tensor.
struct ggml_tensor * ggml_sub(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b) {
    const bool inplace = false;

    return ggml_sub_impl(ctx, a, b, inplace);
}

xuxzh1's avatar
update  
xuxzh1 committed
2006
2007
2008
2009
2010
// Builds a GGML_OP_SUB node for a and b whose result is a view of a.
struct ggml_tensor * ggml_sub_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b) {
    const bool inplace = true;

    return ggml_sub_impl(ctx, a, b, inplace);
}

xuxzh1's avatar
update  
xuxzh1 committed
2013
// ggml_mul
xuxzh1's avatar
init  
xuxzh1 committed
2014

xuxzh1's avatar
update  
xuxzh1 committed
2015
2016
2017
2018
2019
2020
static struct ggml_tensor * ggml_mul_impl(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b,
        bool                  inplace) {
    GGML_ASSERT(ggml_can_repeat(b, a));
xuxzh1's avatar
init  
xuxzh1 committed
2021

xuxzh1's avatar
update  
xuxzh1 committed
2022
    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
xuxzh1's avatar
init  
xuxzh1 committed
2023

xuxzh1's avatar
update  
xuxzh1 committed
2024
2025
2026
    result->op     = GGML_OP_MUL;
    result->src[0] = a;
    result->src[1] = b;
xuxzh1's avatar
init  
xuxzh1 committed
2027

xuxzh1's avatar
update  
xuxzh1 committed
2028
2029
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
2030

xuxzh1's avatar
update  
xuxzh1 committed
2031
2032
2033
2034
2035
2036
// Builds a GGML_OP_MUL node for a and b whose result is a new tensor.
struct ggml_tensor * ggml_mul(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b) {
    const bool inplace = false;

    return ggml_mul_impl(ctx, a, b, inplace);
}
xuxzh1's avatar
init  
xuxzh1 committed
2037

xuxzh1's avatar
update  
xuxzh1 committed
2038
2039
2040
2041
2042
2043
// Builds a GGML_OP_MUL node for a and b whose result is a view of a.
struct ggml_tensor * ggml_mul_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b) {
    const bool inplace = true;

    return ggml_mul_impl(ctx, a, b, inplace);
}
xuxzh1's avatar
init  
xuxzh1 committed
2044

xuxzh1's avatar
update  
xuxzh1 committed
2045
// ggml_div
xuxzh1's avatar
init  
xuxzh1 committed
2046

xuxzh1's avatar
update  
xuxzh1 committed
2047
2048
2049
2050
2051
2052
static struct ggml_tensor * ggml_div_impl(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b,
        bool                  inplace) {
    GGML_ASSERT(ggml_can_repeat(b, a));
xuxzh1's avatar
init  
xuxzh1 committed
2053

xuxzh1's avatar
update  
xuxzh1 committed
2054
    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
xuxzh1's avatar
init  
xuxzh1 committed
2055

xuxzh1's avatar
update  
xuxzh1 committed
2056
2057
2058
    result->op     = GGML_OP_DIV;
    result->src[0] = a;
    result->src[1] = b;
xuxzh1's avatar
init  
xuxzh1 committed
2059

xuxzh1's avatar
update  
xuxzh1 committed
2060
2061
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
2062

xuxzh1's avatar
update  
xuxzh1 committed
2063
2064
2065
2066
2067
2068
// Builds a GGML_OP_DIV node for a and b whose result is a new tensor.
struct ggml_tensor * ggml_div(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b) {
    const bool inplace = false;

    return ggml_div_impl(ctx, a, b, inplace);
}
xuxzh1's avatar
init  
xuxzh1 committed
2069

xuxzh1's avatar
update  
xuxzh1 committed
2070
2071
2072
2073
2074
2075
// Builds a GGML_OP_DIV node for a and b whose result is a view of a.
struct ggml_tensor * ggml_div_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b) {
    const bool inplace = true;

    return ggml_div_impl(ctx, a, b, inplace);
}
xuxzh1's avatar
init  
xuxzh1 committed
2076

xuxzh1's avatar
update  
xuxzh1 committed
2077
// ggml_sqr
xuxzh1's avatar
init  
xuxzh1 committed
2078

xuxzh1's avatar
update  
xuxzh1 committed
2079
2080
2081
2082
2083
static struct ggml_tensor * ggml_sqr_impl(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        bool                  inplace) {
    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
xuxzh1's avatar
init  
xuxzh1 committed
2084

xuxzh1's avatar
update  
xuxzh1 committed
2085
2086
    result->op     = GGML_OP_SQR;
    result->src[0] = a;
xuxzh1's avatar
init  
xuxzh1 committed
2087

xuxzh1's avatar
update  
xuxzh1 committed
2088
2089
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
2090

xuxzh1's avatar
update  
xuxzh1 committed
2091
2092
2093
2094
2095
// Builds a GGML_OP_SQR node for a whose result is a new tensor.
struct ggml_tensor * ggml_sqr(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    const bool inplace = false;

    return ggml_sqr_impl(ctx, a, inplace);
}
xuxzh1's avatar
init  
xuxzh1 committed
2096

xuxzh1's avatar
update  
xuxzh1 committed
2097
2098
2099
2100
2101
// Builds a GGML_OP_SQR node for a whose result is a view of a.
struct ggml_tensor * ggml_sqr_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    const bool inplace = true;

    return ggml_sqr_impl(ctx, a, inplace);
}
xuxzh1's avatar
init  
xuxzh1 committed
2102

xuxzh1's avatar
update  
xuxzh1 committed
2103
// ggml_sqrt
xuxzh1's avatar
init  
xuxzh1 committed
2104

xuxzh1's avatar
update  
xuxzh1 committed
2105
2106
2107
2108
2109
static struct ggml_tensor * ggml_sqrt_impl(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        bool                  inplace) {
    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
xuxzh1's avatar
init  
xuxzh1 committed
2110

xuxzh1's avatar
update  
xuxzh1 committed
2111
2112
    result->op     = GGML_OP_SQRT;
    result->src[0] = a;
xuxzh1's avatar
init  
xuxzh1 committed
2113

xuxzh1's avatar
update  
xuxzh1 committed
2114
2115
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
2116

xuxzh1's avatar
update  
xuxzh1 committed
2117
2118
2119
2120
2121
// Builds a GGML_OP_SQRT node for a whose result is a new tensor.
struct ggml_tensor * ggml_sqrt(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    const bool inplace = false;

    return ggml_sqrt_impl(ctx, a, inplace);
}
xuxzh1's avatar
init  
xuxzh1 committed
2122

xuxzh1's avatar
update  
xuxzh1 committed
2123
2124
2125
2126
2127
// Builds a GGML_OP_SQRT node for a whose result is a view of a.
struct ggml_tensor * ggml_sqrt_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    const bool inplace = true;

    return ggml_sqrt_impl(ctx, a, inplace);
}
xuxzh1's avatar
init  
xuxzh1 committed
2128

xuxzh1's avatar
update  
xuxzh1 committed
2129
// ggml_log
xuxzh1's avatar
init  
xuxzh1 committed
2130

xuxzh1's avatar
update  
xuxzh1 committed
2131
2132
2133
2134
2135
static struct ggml_tensor * ggml_log_impl(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        bool                  inplace) {
    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
xuxzh1's avatar
init  
xuxzh1 committed
2136

xuxzh1's avatar
update  
xuxzh1 committed
2137
2138
    result->op     = GGML_OP_LOG;
    result->src[0] = a;
xuxzh1's avatar
init  
xuxzh1 committed
2139

xuxzh1's avatar
update  
xuxzh1 committed
2140
2141
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
2142

xuxzh1's avatar
update  
xuxzh1 committed
2143
2144
2145
2146
2147
// Builds a GGML_OP_LOG node for a whose result is a new tensor.
struct ggml_tensor * ggml_log(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    const bool inplace = false;

    return ggml_log_impl(ctx, a, inplace);
}
xuxzh1's avatar
init  
xuxzh1 committed
2148

xuxzh1's avatar
update  
xuxzh1 committed
2149
2150
2151
2152
2153
// Builds a GGML_OP_LOG node for a whose result is a view of a.
struct ggml_tensor * ggml_log_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    const bool inplace = true;

    return ggml_log_impl(ctx, a, inplace);
}
xuxzh1's avatar
init  
xuxzh1 committed
2154

xuxzh1's avatar
update  
xuxzh1 committed
2155
// ggml_sin
xuxzh1's avatar
init  
xuxzh1 committed
2156

xuxzh1's avatar
update  
xuxzh1 committed
2157
2158
2159
2160
2161
static struct ggml_tensor * ggml_sin_impl(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        bool                  inplace) {
    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
xuxzh1's avatar
init  
xuxzh1 committed
2162

xuxzh1's avatar
update  
xuxzh1 committed
2163
2164
    result->op     = GGML_OP_SIN;
    result->src[0] = a;
xuxzh1's avatar
init  
xuxzh1 committed
2165

xuxzh1's avatar
update  
xuxzh1 committed
2166
2167
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
2168

xuxzh1's avatar
update  
xuxzh1 committed
2169
2170
2171
2172
// Builds a GGML_OP_SIN node for a whose result is a new tensor.
struct ggml_tensor * ggml_sin(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    const bool inplace = false;

    return ggml_sin_impl(ctx, a, inplace);
}

xuxzh1's avatar
update  
xuxzh1 committed
2175
2176
2177
2178
// Builds a GGML_OP_SIN node for a whose result is a view of a.
struct ggml_tensor * ggml_sin_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    const bool inplace = true;

    return ggml_sin_impl(ctx, a, inplace);
}

xuxzh1's avatar
update  
xuxzh1 committed
2181
2182
2183
2184
2185
2186
2187
// ggml_cos

static struct ggml_tensor * ggml_cos_impl(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        bool                  inplace) {
    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
xuxzh1's avatar
init  
xuxzh1 committed
2188

xuxzh1's avatar
update  
xuxzh1 committed
2189
2190
    result->op     = GGML_OP_COS;
    result->src[0] = a;
xuxzh1's avatar
init  
xuxzh1 committed
2191
2192
2193
2194

    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
2195
2196
2197
2198
// Non-in-place entry point: forwards to ggml_cos_impl with inplace = false.
struct ggml_tensor * ggml_cos(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    const bool inplace = false;
    return ggml_cos_impl(ctx, a, inplace);
}

xuxzh1's avatar
update  
xuxzh1 committed
2201
2202
2203
2204
// In-place entry point: forwards to ggml_cos_impl with inplace = true.
struct ggml_tensor * ggml_cos_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    const bool inplace = true;
    return ggml_cos_impl(ctx, a, inplace);
}

xuxzh1's avatar
update  
xuxzh1 committed
2207
// ggml_sum
xuxzh1's avatar
init  
xuxzh1 committed
2208

xuxzh1's avatar
update  
xuxzh1 committed
2209
2210
2211
2212
2213
2214
2215
2216
2217
// Builds a GGML_OP_SUM node over `a`.
// The result is created with ggml_new_tensor_1d using a's type and a single
// element; `a` is recorded as src[0].
struct ggml_tensor * ggml_sum(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    struct ggml_tensor * out = ggml_new_tensor_1d(ctx, a->type, 1);

    out->src[0] = a;
    out->op     = GGML_OP_SUM;

    return out;
}

xuxzh1's avatar
update  
xuxzh1 committed
2220
// ggml_sum_rows
xuxzh1's avatar
init  
xuxzh1 committed
2221

xuxzh1's avatar
update  
xuxzh1 committed
2222
2223
2224
2225
2226
2227
struct ggml_tensor * ggml_sum_rows(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    int64_t ne[GGML_MAX_DIMS] = { 1 };
    for (int i = 1; i < GGML_MAX_DIMS; ++i) {
        ne[i] = a->ne[i];
xuxzh1's avatar
init  
xuxzh1 committed
2228
2229
    }

xuxzh1's avatar
update  
xuxzh1 committed
2230
    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne);
xuxzh1's avatar
init  
xuxzh1 committed
2231

xuxzh1's avatar
update  
xuxzh1 committed
2232
2233
    result->op     = GGML_OP_SUM_ROWS;
    result->src[0] = a;
xuxzh1's avatar
init  
xuxzh1 committed
2234

xuxzh1's avatar
update  
xuxzh1 committed
2235
    return result;
xuxzh1's avatar
init  
xuxzh1 committed
2236
2237
}

xuxzh1's avatar
update  
xuxzh1 committed
2238
// ggml_mean
xuxzh1's avatar
init  
xuxzh1 committed
2239

xuxzh1's avatar
update  
xuxzh1 committed
2240
2241
2242
2243
2244
struct ggml_tensor * ggml_mean(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    int64_t ne[4] = { 1, a->ne[1], a->ne[2], a->ne[3] };
    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
xuxzh1's avatar
init  
xuxzh1 committed
2245

xuxzh1's avatar
update  
xuxzh1 committed
2246
2247
    result->op     = GGML_OP_MEAN;
    result->src[0] = a;
xuxzh1's avatar
init  
xuxzh1 committed
2248

xuxzh1's avatar
update  
xuxzh1 committed
2249
2250
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
2251

xuxzh1's avatar
update  
xuxzh1 committed
2252
// ggml_argmax
xuxzh1's avatar
init  
xuxzh1 committed
2253

xuxzh1's avatar
update  
xuxzh1 committed
2254
2255
2256
2257
2258
struct ggml_tensor * ggml_argmax(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    GGML_ASSERT(ggml_is_matrix(a));
    GGML_ASSERT(a->ne[0] <= INT32_MAX);
xuxzh1's avatar
init  
xuxzh1 committed
2259

xuxzh1's avatar
update  
xuxzh1 committed
2260
    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, a->ne[1]);
xuxzh1's avatar
init  
xuxzh1 committed
2261

xuxzh1's avatar
update  
xuxzh1 committed
2262
2263
    result->op     = GGML_OP_ARGMAX;
    result->src[0] = a;
xuxzh1's avatar
init  
xuxzh1 committed
2264

xuxzh1's avatar
update  
xuxzh1 committed
2265
2266
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
2267

xuxzh1's avatar
update  
xuxzh1 committed
2268
// ggml_count_equal
xuxzh1's avatar
init  
xuxzh1 committed
2269

xuxzh1's avatar
update  
xuxzh1 committed
2270
2271
2272
2273
2274
struct ggml_tensor * ggml_count_equal(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b) {
    GGML_ASSERT(ggml_are_same_shape(a, b));
xuxzh1's avatar
init  
xuxzh1 committed
2275

xuxzh1's avatar
update  
xuxzh1 committed
2276
    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I64, 1);
xuxzh1's avatar
init  
xuxzh1 committed
2277

xuxzh1's avatar
update  
xuxzh1 committed
2278
2279
2280
    result->op     = GGML_OP_COUNT_EQUAL;
    result->src[0] = a;
    result->src[1] = b;
xuxzh1's avatar
init  
xuxzh1 committed
2281

xuxzh1's avatar
update  
xuxzh1 committed
2282
    return result;
xuxzh1's avatar
init  
xuxzh1 committed
2283
2284
}

xuxzh1's avatar
update  
xuxzh1 committed
2285
2286
2287
// ggml_repeat

struct ggml_tensor * ggml_repeat(
xuxzh1's avatar
init  
xuxzh1 committed
2288
        struct ggml_context * ctx,
xuxzh1's avatar
update  
xuxzh1 committed
2289
2290
2291
        struct ggml_tensor  * a,
        struct ggml_tensor  * b) {
    GGML_ASSERT(ggml_can_repeat(a, b));
xuxzh1's avatar
init  
xuxzh1 committed
2292

xuxzh1's avatar
update  
xuxzh1 committed
2293
    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);
xuxzh1's avatar
init  
xuxzh1 committed
2294

xuxzh1's avatar
update  
xuxzh1 committed
2295
2296
    result->op     = GGML_OP_REPEAT;
    result->src[0] = a;
xuxzh1's avatar
init  
xuxzh1 committed
2297

xuxzh1's avatar
update  
xuxzh1 committed
2298
2299
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
2300

xuxzh1's avatar
update  
xuxzh1 committed
2301
// ggml_repeat_back
xuxzh1's avatar
init  
xuxzh1 committed
2302

xuxzh1's avatar
update  
xuxzh1 committed
2303
2304
2305
2306
2307
struct ggml_tensor * ggml_repeat_back(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b) {
    GGML_ASSERT(ggml_can_repeat(b, a));
xuxzh1's avatar
init  
xuxzh1 committed
2308

xuxzh1's avatar
update  
xuxzh1 committed
2309
    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);
xuxzh1's avatar
init  
xuxzh1 committed
2310

xuxzh1's avatar
update  
xuxzh1 committed
2311
2312
    result->op     = GGML_OP_REPEAT_BACK;
    result->src[0] = a;
xuxzh1's avatar
init  
xuxzh1 committed
2313

xuxzh1's avatar
update  
xuxzh1 committed
2314
2315
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
2316

xuxzh1's avatar
update  
xuxzh1 committed
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
// ggml_concat

struct ggml_tensor * ggml_concat(
    struct ggml_context * ctx,
    struct ggml_tensor  * a,
    struct ggml_tensor  * b,
    int                   dim) {
    GGML_ASSERT(dim >= 0 && dim < GGML_MAX_DIMS);

    int64_t ne[GGML_MAX_DIMS];
    for (int d = 0; d < GGML_MAX_DIMS; ++d) {
        if (d == dim) {
            ne[d] = a->ne[d] + b->ne[d];
            continue;
xuxzh1's avatar
init  
xuxzh1 committed
2331
        }
xuxzh1's avatar
update  
xuxzh1 committed
2332
2333
        GGML_ASSERT(a->ne[d] == b->ne[d]);
        ne[d] = a->ne[d];
xuxzh1's avatar
init  
xuxzh1 committed
2334
2335
    }

xuxzh1's avatar
update  
xuxzh1 committed
2336
    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne);
xuxzh1's avatar
init  
xuxzh1 committed
2337

xuxzh1's avatar
update  
xuxzh1 committed
2338
    ggml_set_op_params_i32(result, 0, dim);
xuxzh1's avatar
init  
xuxzh1 committed
2339

xuxzh1's avatar
update  
xuxzh1 committed
2340
2341
2342
    result->op     = GGML_OP_CONCAT;
    result->src[0] = a;
    result->src[1] = b;
xuxzh1's avatar
init  
xuxzh1 committed
2343

xuxzh1's avatar
update  
xuxzh1 committed
2344
2345
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
2346

xuxzh1's avatar
update  
xuxzh1 committed
2347
// ggml_abs
xuxzh1's avatar
init  
xuxzh1 committed
2348

xuxzh1's avatar
update  
xuxzh1 committed
2349
2350
2351
2352
2353
// Delegates to ggml_unary with GGML_UNARY_OP_ABS.
struct ggml_tensor * ggml_abs(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    struct ggml_tensor * result = ggml_unary(ctx, a, GGML_UNARY_OP_ABS);
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
2354

xuxzh1's avatar
update  
xuxzh1 committed
2355
2356
2357
2358
2359
// Delegates to ggml_unary_inplace with GGML_UNARY_OP_ABS.
struct ggml_tensor * ggml_abs_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    struct ggml_tensor * result = ggml_unary_inplace(ctx, a, GGML_UNARY_OP_ABS);
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
2360

xuxzh1's avatar
update  
xuxzh1 committed
2361
// ggml_sgn
xuxzh1's avatar
init  
xuxzh1 committed
2362

xuxzh1's avatar
update  
xuxzh1 committed
2363
2364
2365
2366
2367
// Delegates to ggml_unary with GGML_UNARY_OP_SGN.
struct ggml_tensor * ggml_sgn(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    struct ggml_tensor * result = ggml_unary(ctx, a, GGML_UNARY_OP_SGN);
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
2368

xuxzh1's avatar
update  
xuxzh1 committed
2369
2370
2371
2372
2373
// Delegates to ggml_unary_inplace with GGML_UNARY_OP_SGN.
struct ggml_tensor * ggml_sgn_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    struct ggml_tensor * result = ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SGN);
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
2374

xuxzh1's avatar
update  
xuxzh1 committed
2375
2376
2377
2378
2379
2380
// ggml_neg

// Delegates to ggml_unary with GGML_UNARY_OP_NEG.
struct ggml_tensor * ggml_neg(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    struct ggml_tensor * result = ggml_unary(ctx, a, GGML_UNARY_OP_NEG);
    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
2383
struct ggml_tensor * ggml_neg_inplace(
xuxzh1's avatar
init  
xuxzh1 committed
2384
        struct ggml_context * ctx,
xuxzh1's avatar
update  
xuxzh1 committed
2385
2386
        struct ggml_tensor  * a) {
    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_NEG);
xuxzh1's avatar
init  
xuxzh1 committed
2387
2388
}

xuxzh1's avatar
update  
xuxzh1 committed
2389
2390
2391
// ggml_step

struct ggml_tensor * ggml_step(
xuxzh1's avatar
init  
xuxzh1 committed
2392
        struct ggml_context * ctx,
xuxzh1's avatar
update  
xuxzh1 committed
2393
2394
        struct ggml_tensor  * a) {
    return ggml_unary(ctx, a, GGML_UNARY_OP_STEP);
xuxzh1's avatar
init  
xuxzh1 committed
2395
2396
}

xuxzh1's avatar
update  
xuxzh1 committed
2397
struct ggml_tensor * ggml_step_inplace(
xuxzh1's avatar
init  
xuxzh1 committed
2398
        struct ggml_context * ctx,
xuxzh1's avatar
update  
xuxzh1 committed
2399
2400
        struct ggml_tensor  * a) {
    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_STEP);
xuxzh1's avatar
init  
xuxzh1 committed
2401
2402
}

xuxzh1's avatar
update  
xuxzh1 committed
2403
2404
2405
// ggml_tanh

struct ggml_tensor * ggml_tanh(
xuxzh1's avatar
init  
xuxzh1 committed
2406
        struct ggml_context * ctx,
xuxzh1's avatar
update  
xuxzh1 committed
2407
2408
        struct ggml_tensor  * a) {
    return ggml_unary(ctx, a, GGML_UNARY_OP_TANH);
xuxzh1's avatar
init  
xuxzh1 committed
2409
2410
}

xuxzh1's avatar
update  
xuxzh1 committed
2411
struct ggml_tensor * ggml_tanh_inplace(
xuxzh1's avatar
init  
xuxzh1 committed
2412
        struct ggml_context * ctx,
xuxzh1's avatar
update  
xuxzh1 committed
2413
2414
        struct ggml_tensor  * a) {
    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_TANH);
xuxzh1's avatar
init  
xuxzh1 committed
2415
2416
}

xuxzh1's avatar
update  
xuxzh1 committed
2417
// ggml_elu
xuxzh1's avatar
init  
xuxzh1 committed
2418

xuxzh1's avatar
update  
xuxzh1 committed
2419
2420
2421
2422
2423
// Delegates to ggml_unary with GGML_UNARY_OP_ELU.
struct ggml_tensor * ggml_elu(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    struct ggml_tensor * result = ggml_unary(ctx, a, GGML_UNARY_OP_ELU);
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
2424

xuxzh1's avatar
update  
xuxzh1 committed
2425
2426
2427
2428
2429
// Delegates to ggml_unary_inplace with GGML_UNARY_OP_ELU.
struct ggml_tensor * ggml_elu_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    struct ggml_tensor * result = ggml_unary_inplace(ctx, a, GGML_UNARY_OP_ELU);
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
2430

xuxzh1's avatar
update  
xuxzh1 committed
2431
// ggml_relu
xuxzh1's avatar
init  
xuxzh1 committed
2432

xuxzh1's avatar
update  
xuxzh1 committed
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
// Delegates to ggml_unary with GGML_UNARY_OP_RELU.
struct ggml_tensor * ggml_relu(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    struct ggml_tensor * result = ggml_unary(ctx, a, GGML_UNARY_OP_RELU);
    return result;
}

// Delegates to ggml_unary_inplace with GGML_UNARY_OP_RELU.
struct ggml_tensor * ggml_relu_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    struct ggml_tensor * result = ggml_unary_inplace(ctx, a, GGML_UNARY_OP_RELU);
    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
2445
// ggml_leaky_relu
xuxzh1's avatar
init  
xuxzh1 committed
2446

xuxzh1's avatar
update  
xuxzh1 committed
2447
2448
2449
2450
2451
2452
struct ggml_tensor * ggml_leaky_relu(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        float                 negative_slope,
        bool                  inplace) {
    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
xuxzh1's avatar
init  
xuxzh1 committed
2453

xuxzh1's avatar
update  
xuxzh1 committed
2454
    ggml_set_op_params(result, &negative_slope, sizeof(negative_slope));
xuxzh1's avatar
init  
xuxzh1 committed
2455

xuxzh1's avatar
update  
xuxzh1 committed
2456
2457
    result->op     = GGML_OP_LEAKY_RELU;
    result->src[0] = a;
xuxzh1's avatar
init  
xuxzh1 committed
2458
2459
2460
2461

    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
2462
2463
2464
2465
2466
2467
// ggml_sigmoid

// Delegates to ggml_unary with GGML_UNARY_OP_SIGMOID.
struct ggml_tensor * ggml_sigmoid(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    struct ggml_tensor * result = ggml_unary(ctx, a, GGML_UNARY_OP_SIGMOID);
    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
2470
2471
2472
2473
// Delegates to ggml_unary_inplace with GGML_UNARY_OP_SIGMOID.
struct ggml_tensor * ggml_sigmoid_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    struct ggml_tensor * result = ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SIGMOID);
    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
2476
2477
2478
2479
2480
2481
// ggml_gelu

// Delegates to ggml_unary with GGML_UNARY_OP_GELU.
struct ggml_tensor * ggml_gelu(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    struct ggml_tensor * result = ggml_unary(ctx, a, GGML_UNARY_OP_GELU);
    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
2484
2485
2486
2487
// Delegates to ggml_unary_inplace with GGML_UNARY_OP_GELU.
struct ggml_tensor * ggml_gelu_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    struct ggml_tensor * result = ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU);
    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
2490
2491
2492
2493
2494
2495
// ggml_gelu_quick

// Delegates to ggml_unary with GGML_UNARY_OP_GELU_QUICK.
struct ggml_tensor * ggml_gelu_quick(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    struct ggml_tensor * result = ggml_unary(ctx, a, GGML_UNARY_OP_GELU_QUICK);
    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
2498
2499
2500
2501
// Delegates to ggml_unary_inplace with GGML_UNARY_OP_GELU_QUICK.
struct ggml_tensor * ggml_gelu_quick_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    struct ggml_tensor * result = ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU_QUICK);
    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
2504
2505
2506
2507
2508
2509
// ggml_silu

// Delegates to ggml_unary with GGML_UNARY_OP_SILU.
struct ggml_tensor * ggml_silu(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    struct ggml_tensor * result = ggml_unary(ctx, a, GGML_UNARY_OP_SILU);
    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
2512
2513
2514
2515
2516
// Delegates to ggml_unary_inplace with GGML_UNARY_OP_SILU.
struct ggml_tensor * ggml_silu_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    struct ggml_tensor * result = ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SILU);
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
2517

xuxzh1's avatar
update  
xuxzh1 committed
2518
// ggml_silu_back
xuxzh1's avatar
init  
xuxzh1 committed
2519

xuxzh1's avatar
update  
xuxzh1 committed
2520
2521
2522
2523
2524
struct ggml_tensor * ggml_silu_back(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b) {
    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
xuxzh1's avatar
init  
xuxzh1 committed
2525

xuxzh1's avatar
update  
xuxzh1 committed
2526
2527
2528
2529
2530
    result->op     = GGML_OP_SILU_BACK;
    result->src[0] = a;
    result->src[1] = b;

    return result;
xuxzh1's avatar
init  
xuxzh1 committed
2531
2532
}

xuxzh1's avatar
update  
xuxzh1 committed
2533
// ggml_hardswish
xuxzh1's avatar
init  
xuxzh1 committed
2534

xuxzh1's avatar
update  
xuxzh1 committed
2535
2536
2537
2538
2539
// Delegates to ggml_unary with GGML_UNARY_OP_HARDSWISH.
struct ggml_tensor * ggml_hardswish(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    struct ggml_tensor * result = ggml_unary(ctx, a, GGML_UNARY_OP_HARDSWISH);
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
2540

xuxzh1's avatar
update  
xuxzh1 committed
2541
// ggml_hardsigmoid
xuxzh1's avatar
init  
xuxzh1 committed
2542

xuxzh1's avatar
update  
xuxzh1 committed
2543
2544
2545
2546
// Delegates to ggml_unary with GGML_UNARY_OP_HARDSIGMOID.
struct ggml_tensor * ggml_hardsigmoid(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    struct ggml_tensor * result = ggml_unary(ctx, a, GGML_UNARY_OP_HARDSIGMOID);
    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
2549
// ggml_exp
xuxzh1's avatar
init  
xuxzh1 committed
2550

xuxzh1's avatar
update  
xuxzh1 committed
2551
2552
2553
2554
// Delegates to ggml_unary with GGML_UNARY_OP_EXP.
struct ggml_tensor * ggml_exp(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    struct ggml_tensor * result = ggml_unary(ctx, a, GGML_UNARY_OP_EXP);
    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
2557
2558
2559
2560
// Delegates to ggml_unary_inplace with GGML_UNARY_OP_EXP.
struct ggml_tensor * ggml_exp_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    struct ggml_tensor * result = ggml_unary_inplace(ctx, a, GGML_UNARY_OP_EXP);
    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
// ggml_norm

// Builds a GGML_OP_NORM node with `a` as its only source; `eps` is copied
// into the node's op params.
// inplace == true  -> result comes from ggml_view_tensor(ctx, a)
// inplace == false -> result comes from ggml_dup_tensor(ctx, a)
static struct ggml_tensor * ggml_norm_impl(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        float                 eps,
        bool                  inplace) {
    struct ggml_tensor * out;
    if (inplace) {
        out = ggml_view_tensor(ctx, a);
    } else {
        out = ggml_dup_tensor(ctx, a);
    }

    ggml_set_op_params(out, &eps, sizeof(eps));

    out->src[0] = a;
    out->op     = GGML_OP_NORM;

    return out;
}

xuxzh1's avatar
update  
xuxzh1 committed
2580
2581
2582
2583
2584
// Non-in-place entry point: forwards to ggml_norm_impl with inplace = false.
struct ggml_tensor * ggml_norm(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        float                 eps) {
    const bool inplace = false;
    return ggml_norm_impl(ctx, a, eps, inplace);
}

xuxzh1's avatar
update  
xuxzh1 committed
2587
2588
2589
2590
2591
// In-place entry point: forwards to ggml_norm_impl with inplace = true.
struct ggml_tensor * ggml_norm_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        float                 eps) {
    const bool inplace = true;
    return ggml_norm_impl(ctx, a, eps, inplace);
}

xuxzh1's avatar
update  
xuxzh1 committed
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
// ggml_rms_norm

// Builds a GGML_OP_RMS_NORM node with `a` as its only source; `eps` is copied
// into the node's op params.
// inplace == true  -> result comes from ggml_view_tensor(ctx, a)
// inplace == false -> result comes from ggml_dup_tensor(ctx, a)
static struct ggml_tensor * ggml_rms_norm_impl(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        float                 eps,
        bool                  inplace) {
    struct ggml_tensor * out;
    if (inplace) {
        out = ggml_view_tensor(ctx, a);
    } else {
        out = ggml_dup_tensor(ctx, a);
    }

    ggml_set_op_params(out, &eps, sizeof(eps));

    out->src[0] = a;
    out->op     = GGML_OP_RMS_NORM;

    return out;
}

xuxzh1's avatar
update  
xuxzh1 committed
2611
2612
2613
2614
2615
// Non-in-place entry point: forwards to ggml_rms_norm_impl with inplace = false.
struct ggml_tensor * ggml_rms_norm(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        float                 eps) {
    const bool inplace = false;
    return ggml_rms_norm_impl(ctx, a, eps, inplace);
}

xuxzh1's avatar
update  
xuxzh1 committed
2618
2619
2620
2621
2622
// In-place entry point: forwards to ggml_rms_norm_impl with inplace = true.
struct ggml_tensor * ggml_rms_norm_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        float                 eps) {
    const bool inplace = true;
    return ggml_rms_norm_impl(ctx, a, eps, inplace);
}

xuxzh1's avatar
update  
xuxzh1 committed
2625
// ggml_rms_norm_back
xuxzh1's avatar
init  
xuxzh1 committed
2626

xuxzh1's avatar
update  
xuxzh1 committed
2627
2628
2629
2630
2631
2632
// Builds a GGML_OP_RMS_NORM_BACK node with sources `a` (src[0]) and `b`
// (src[1]); `eps` is copied into the node's op params.
// The result tensor is obtained from ggml_dup_tensor(ctx, a).
struct ggml_tensor * ggml_rms_norm_back(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b,
        float                 eps) {
    struct ggml_tensor * out = ggml_dup_tensor(ctx, a);

    ggml_set_op_params(out, &eps, sizeof(eps));

    out->src[0] = a;
    out->src[1] = b;
    out->op     = GGML_OP_RMS_NORM_BACK;

    return out;
}

xuxzh1's avatar
update  
xuxzh1 committed
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
// ggml_group_norm

// Builds a GGML_OP_GROUP_NORM node with `a` as its only source.
// Op params: slot 0 holds n_groups (i32), slot 1 holds eps (f32).
// inplace == true  -> result comes from ggml_view_tensor(ctx, a)
// inplace == false -> result comes from ggml_dup_tensor(ctx, a)
static struct ggml_tensor * ggml_group_norm_impl(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int                   n_groups,
        float                 eps,
        bool                  inplace) {
    struct ggml_tensor * out;
    if (inplace) {
        out = ggml_view_tensor(ctx, a);
    } else {
        out = ggml_dup_tensor(ctx, a);
    }

    ggml_set_op_params_i32(out, 0, n_groups);
    ggml_set_op_params_f32(out, 1, eps);

    out->src[0] = a;
    out->op     = GGML_OP_GROUP_NORM;

    return out;
}

xuxzh1's avatar
update  
xuxzh1 committed
2662
2663
2664
2665
2666
2667
// Non-in-place entry point: forwards to ggml_group_norm_impl with inplace = false.
struct ggml_tensor * ggml_group_norm(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int                   n_groups,
        float                 eps) {
    const bool inplace = false;
    return ggml_group_norm_impl(ctx, a, n_groups, eps, inplace);
}

xuxzh1's avatar
update  
xuxzh1 committed
2670
2671
2672
2673
2674
2675
// In-place entry point: forwards to ggml_group_norm_impl with inplace = true.
struct ggml_tensor * ggml_group_norm_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int                   n_groups,
        float                 eps) {
    const bool inplace = true;
    return ggml_group_norm_impl(ctx, a, n_groups, eps, inplace);
}

xuxzh1's avatar
update  
xuxzh1 committed
2678
2679
2680
2681
2682
2683
2684
2685
// ggml_mul_mat

// Shape compatibility check used by ggml_mul_mat.
// True when t0 and t1 agree on ne[0] and t1's ne[2] / ne[3] are whole
// multiples of t0's (i.e. t0 can be broadcast over t1 in those dimensions).
static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

    if (t0->ne[0] != t1->ne[0]) {
        return false;
    }
    // verify t0 is broadcastable
    if (t1->ne[2]%t0->ne[2] != 0) {
        return false;
    }
    return t1->ne[3]%t0->ne[3] == 0;
}

xuxzh1's avatar
update  
xuxzh1 committed
2688
struct ggml_tensor * ggml_mul_mat(
xuxzh1's avatar
init  
xuxzh1 committed
2689
        struct ggml_context * ctx,
xuxzh1's avatar
update  
xuxzh1 committed
2690
2691
2692
2693
        struct ggml_tensor  * a,
        struct ggml_tensor  * b) {
    GGML_ASSERT(ggml_can_mul_mat(a, b));
    GGML_ASSERT(!ggml_is_transposed(a));
xuxzh1's avatar
init  
xuxzh1 committed
2694

xuxzh1's avatar
update  
xuxzh1 committed
2695
2696
2697
2698
2699
2700
    const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] };
    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);

    result->op     = GGML_OP_MUL_MAT;
    result->src[0] = a;
    result->src[1] = b;
xuxzh1's avatar
init  
xuxzh1 committed
2701
2702
2703
2704

    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
2705
2706
2707
2708
// Stores `prec` (as an int32) in op param slot 0 of `a`.
// `a` must already be a GGML_OP_MUL_MAT node.
void ggml_mul_mat_set_prec(
        struct ggml_tensor * a,
        enum ggml_prec       prec) {
    GGML_ASSERT(a->op == GGML_OP_MUL_MAT);

    ggml_set_op_params_i32(a, 0, (int32_t) prec);
}
xuxzh1's avatar
init  
xuxzh1 committed
2714

xuxzh1's avatar
update  
xuxzh1 committed
2715
// ggml_mul_mat_id
xuxzh1's avatar
init  
xuxzh1 committed
2716

xuxzh1's avatar
update  
xuxzh1 committed
2717
2718
/*
    c = ggml_mul_mat_id(ctx, as, b, ids);
xuxzh1's avatar
init  
xuxzh1 committed
2719

xuxzh1's avatar
update  
xuxzh1 committed
2720
2721
2722
2723
    as  -> [cols, rows, n_expert]
    ids -> [n_experts_used, n_tokens] (i32)
    b   -> [cols, n_expert_used, n_tokens]
    c   -> [rows, n_expert_used, n_tokens]
xuxzh1's avatar
init  
xuxzh1 committed
2724

xuxzh1's avatar
update  
xuxzh1 committed
2725
    in b, n_experts_used can be broadcasted to match the n_expert_used of ids
xuxzh1's avatar
init  
xuxzh1 committed
2726

xuxzh1's avatar
update  
xuxzh1 committed
2727
2728
2729
2730
2731
2732
2733
2734
2735
    c ~= as[:,:,i] @ b[:,i%r,t], i = ids[e,t] for all e,t in ids
*/
struct ggml_tensor * ggml_mul_mat_id(
        struct ggml_context * ctx,
        struct ggml_tensor  * as,
        struct ggml_tensor  * b,
        struct ggml_tensor  * ids) {
    GGML_ASSERT(!ggml_is_transposed(as));
    GGML_ASSERT(ids->type == GGML_TYPE_I32);
xuxzh1's avatar
init  
xuxzh1 committed
2736

xuxzh1's avatar
update  
xuxzh1 committed
2737
2738
2739
2740
2741
2742
    GGML_ASSERT(as->ne[3] == 1); // as is 3d (one matrix per expert)
    GGML_ASSERT(b->ne[3] == 1); // b is 3d
    GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); // ids is 2d
    GGML_ASSERT(ids->ne[1] == b->ne[2]); // must have an expert list per b row
    GGML_ASSERT(as->ne[0] == b->ne[0]); // can_mul_mat
    GGML_ASSERT(ids->ne[0] % b->ne[1] == 0); // can broadcast
xuxzh1's avatar
init  
xuxzh1 committed
2743

xuxzh1's avatar
update  
xuxzh1 committed
2744
2745
    const int64_t ne[4] = { as->ne[1], ids->ne[0], b->ne[2], 1 };
    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
xuxzh1's avatar
init  
xuxzh1 committed
2746

xuxzh1's avatar
update  
xuxzh1 committed
2747
2748
2749
2750
    result->op     = GGML_OP_MUL_MAT_ID;
    result->src[0] = as;
    result->src[1] = b;
    result->src[2] = ids;
xuxzh1's avatar
init  
xuxzh1 committed
2751

xuxzh1's avatar
update  
xuxzh1 committed
2752
2753
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
2754

xuxzh1's avatar
update  
xuxzh1 committed
2755
// ggml_out_prod
xuxzh1's avatar
init  
xuxzh1 committed
2756

xuxzh1's avatar
update  
xuxzh1 committed
2757
2758
static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
xuxzh1's avatar
init  
xuxzh1 committed
2759

xuxzh1's avatar
update  
xuxzh1 committed
2760
2761
2762
    return (t0->ne[1] == t1->ne[1])   &&
           (t1->ne[2]%t0->ne[2] == 0) && // verify t0 is broadcastable
           (t1->ne[3]%t0->ne[3] == 0);
xuxzh1's avatar
init  
xuxzh1 committed
2763
2764
}

xuxzh1's avatar
update  
xuxzh1 committed
2765
2766
2767
2768
2769
2770
struct ggml_tensor * ggml_out_prod(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b) {
    GGML_ASSERT(ggml_can_out_prod(a, b));
    GGML_ASSERT(!ggml_is_transposed(a));
xuxzh1's avatar
init  
xuxzh1 committed
2771

xuxzh1's avatar
update  
xuxzh1 committed
2772
2773
2774
    // a is broadcastable to b for ne[2] and ne[3] -> use b->ne[2] and b->ne[3]
    const int64_t ne[4] = { a->ne[0], b->ne[0], b->ne[2], b->ne[3] };
    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
xuxzh1's avatar
init  
xuxzh1 committed
2775

xuxzh1's avatar
update  
xuxzh1 committed
2776
2777
2778
    result->op     = GGML_OP_OUT_PROD;
    result->src[0] = a;
    result->src[1] = b;
xuxzh1's avatar
init  
xuxzh1 committed
2779

xuxzh1's avatar
update  
xuxzh1 committed
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
    return result;
}

// ggml_scale

static struct ggml_tensor * ggml_scale_impl(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        float                 s,
        bool                  inplace) {
    GGML_ASSERT(ggml_is_padded_1d(a));
xuxzh1's avatar
init  
xuxzh1 committed
2791
2792
2793

    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

xuxzh1's avatar
update  
xuxzh1 committed
2794
2795
2796
    ggml_set_op_params(result, &s, sizeof(s));

    result->op     = GGML_OP_SCALE;
xuxzh1's avatar
init  
xuxzh1 committed
2797
2798
2799
2800
2801
    result->src[0] = a;

    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
2802
// Non-in-place entry point: forwards to ggml_scale_impl with inplace = false.
struct ggml_tensor * ggml_scale(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        float                 s) {
    const bool inplace = false;
    return ggml_scale_impl(ctx, a, s, inplace);
}

xuxzh1's avatar
update  
xuxzh1 committed
2809
// In-place entry point: forwards to ggml_scale_impl with inplace = true.
struct ggml_tensor * ggml_scale_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        float                 s) {
    const bool inplace = true;
    return ggml_scale_impl(ctx, a, s, inplace);
}

xuxzh1's avatar
update  
xuxzh1 committed
2816
// ggml_set
xuxzh1's avatar
init  
xuxzh1 committed
2817

xuxzh1's avatar
update  
xuxzh1 committed
2818
static struct ggml_tensor * ggml_set_impl(
xuxzh1's avatar
init  
xuxzh1 committed
2819
        struct ggml_context * ctx,
xuxzh1's avatar
update  
xuxzh1 committed
2820
2821
2822
2823
2824
2825
2826
2827
        struct ggml_tensor  * a,
        struct ggml_tensor  * b,
        size_t                nb1,
        size_t                nb2,
        size_t                nb3,
        size_t                offset,
        bool                  inplace) {
    GGML_ASSERT(ggml_nelements(a) >= ggml_nelements(b));
xuxzh1's avatar
init  
xuxzh1 committed
2828

xuxzh1's avatar
update  
xuxzh1 committed
2829
    // make a view of the destination
xuxzh1's avatar
init  
xuxzh1 committed
2830
2831
    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

xuxzh1's avatar
update  
xuxzh1 committed
2832
2833
2834
2835
2836
    GGML_ASSERT(offset < (size_t)(1 << 30));
    int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
    ggml_set_op_params(result, params, sizeof(params));

    result->op     = GGML_OP_SET;
xuxzh1's avatar
init  
xuxzh1 committed
2837
2838
2839
2840
2841
2842
    result->src[0] = a;
    result->src[1] = b;

    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
2843
// Non-in-place entry point: forwards all arguments to ggml_set_impl with
// inplace = false.
struct ggml_tensor * ggml_set(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b,
        size_t                nb1,
        size_t                nb2,
        size_t                nb3,
        size_t                offset) {
    const bool inplace = false;
    return ggml_set_impl(ctx, a, b, nb1, nb2, nb3, offset, inplace);
}

xuxzh1's avatar
update  
xuxzh1 committed
2854
// In-place entry point: forwards all arguments to ggml_set_impl with
// inplace = true.
struct ggml_tensor * ggml_set_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b,
        size_t                nb1,
        size_t                nb2,
        size_t                nb3,
        size_t                offset) {
    const bool inplace = true;
    return ggml_set_impl(ctx, a, b, nb1, nb2, nb3, offset, inplace);
}
xuxzh1's avatar
init  
xuxzh1 committed
2864

xuxzh1's avatar
update  
xuxzh1 committed
2865
struct ggml_tensor * ggml_set_1d(
xuxzh1's avatar
init  
xuxzh1 committed
2866
        struct ggml_context * ctx,
xuxzh1's avatar
update  
xuxzh1 committed
2867
2868
2869
2870
2871
        struct ggml_tensor  * a,
        struct ggml_tensor  * b,
        size_t                offset) {
    return ggml_set_impl(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], offset, false);
}
xuxzh1's avatar
init  
xuxzh1 committed
2872

xuxzh1's avatar
update  
xuxzh1 committed
2873
2874
2875
2876
2877
2878
2879
// In-place SET that takes nb1/nb2/nb3 from a itself (a->nb[1..3]);
// only the offset is supplied by the caller.
struct ggml_tensor * ggml_set_1d_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b,
        size_t                offset) {
    const size_t nb1 = a->nb[1];
    const size_t nb2 = a->nb[2];
    const size_t nb3 = a->nb[3];

    return ggml_set_impl(ctx, a, b, nb1, nb2, nb3, offset, /*inplace =*/ true);
}
xuxzh1's avatar
init  
xuxzh1 committed
2880

xuxzh1's avatar
update  
xuxzh1 committed
2881
2882
2883
2884
2885
2886
2887
// Out-of-place SET with a caller-supplied nb1; nb2 and nb3 are taken from
// a->nb[2] and a->nb[3].
struct ggml_tensor * ggml_set_2d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b,
        size_t                nb1,
        size_t                offset) {
    const size_t nb2 = a->nb[2];
    const size_t nb3 = a->nb[3];

    return ggml_set_impl(ctx, a, b, nb1, nb2, nb3, offset, /*inplace =*/ false);
}

xuxzh1's avatar
update  
xuxzh1 committed
2890
struct ggml_tensor * ggml_set_2d_inplace(
xuxzh1's avatar
init  
xuxzh1 committed
2891
        struct ggml_context * ctx,
xuxzh1's avatar
update  
xuxzh1 committed
2892
2893
2894
2895
2896
        struct ggml_tensor  * a,
        struct ggml_tensor  * b,
        size_t                nb1,
        size_t                offset) {
    return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, true);
xuxzh1's avatar
init  
xuxzh1 committed
2897
2898
}

xuxzh1's avatar
update  
xuxzh1 committed
2899
// ggml_cpy
xuxzh1's avatar
init  
xuxzh1 committed
2900

xuxzh1's avatar
update  
xuxzh1 committed
2901
static struct ggml_tensor * ggml_cpy_impl(
xuxzh1's avatar
init  
xuxzh1 committed
2902
        struct ggml_context * ctx,
xuxzh1's avatar
update  
xuxzh1 committed
2903
2904
2905
        struct ggml_tensor  * a,
        struct ggml_tensor  * b) {
    GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b));
xuxzh1's avatar
init  
xuxzh1 committed
2906

xuxzh1's avatar
update  
xuxzh1 committed
2907
2908
2909
2910
2911
2912
    // make a view of the destination
    struct ggml_tensor * result = ggml_view_tensor(ctx, b);
    if (strlen(b->name) > 0) {
        ggml_format_name(result, "%s (copy of %s)", b->name, a->name);
    } else {
        ggml_format_name(result, "%s (copy)", a->name);
xuxzh1's avatar
init  
xuxzh1 committed
2913
2914
    }

xuxzh1's avatar
update  
xuxzh1 committed
2915
    result->op     = GGML_OP_CPY;
xuxzh1's avatar
init  
xuxzh1 committed
2916
2917
2918
2919
2920
2921
    result->src[0] = a;
    result->src[1] = b;

    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
2922
// Public entry point for the copy op; delegates directly to ggml_cpy_impl.
struct ggml_tensor * ggml_cpy(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b) {
    struct ggml_tensor * copy_node = ggml_cpy_impl(ctx, a, b);

    return copy_node;
}

xuxzh1's avatar
update  
xuxzh1 committed
2929
struct ggml_tensor * ggml_cast(
xuxzh1's avatar
init  
xuxzh1 committed
2930
        struct ggml_context * ctx,
xuxzh1's avatar
update  
xuxzh1 committed
2931
2932
2933
2934
        struct ggml_tensor  * a,
        enum   ggml_type      type) {
    struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne);
    ggml_format_name(result, "%s (copy)", a->name);
xuxzh1's avatar
init  
xuxzh1 committed
2935

xuxzh1's avatar
update  
xuxzh1 committed
2936
2937
2938
    result->op     = GGML_OP_CPY;
    result->src[0] = a;
    result->src[1] = result;
xuxzh1's avatar
init  
xuxzh1 committed
2939

xuxzh1's avatar
update  
xuxzh1 committed
2940
2941
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
2942

xuxzh1's avatar
update  
xuxzh1 committed
2943
// ggml_cont
xuxzh1's avatar
init  
xuxzh1 committed
2944

xuxzh1's avatar
update  
xuxzh1 committed
2945
2946
2947
2948
2949
static struct ggml_tensor * ggml_cont_impl(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
    ggml_format_name(result, "%s (cont)", a->name);
xuxzh1's avatar
init  
xuxzh1 committed
2950

xuxzh1's avatar
update  
xuxzh1 committed
2951
    result->op     = GGML_OP_CONT;
xuxzh1's avatar
init  
xuxzh1 committed
2952
2953
2954
2955
2956
    result->src[0] = a;

    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
2957
// Public entry point for the CONT op; delegates directly to ggml_cont_impl.
struct ggml_tensor * ggml_cont(
        struct ggml_context * ctx,
        struct ggml_tensor * a) {
    struct ggml_tensor * cont_node = ggml_cont_impl(ctx, a);

    return cont_node;
}

xuxzh1's avatar
update  
xuxzh1 committed
2963
2964
// make contiguous, with new shape
// 1-D convenience wrapper around ggml_cont_4d: the three trailing
// dimensions are fixed to 1.
GGML_API struct ggml_tensor * ggml_cont_1d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int64_t               ne0) {
    const int64_t ne1 = 1;
    const int64_t ne2 = 1;
    const int64_t ne3 = 1;

    return ggml_cont_4d(ctx, a, ne0, ne1, ne2, ne3);
}

xuxzh1's avatar
update  
xuxzh1 committed
2971
// 2-D convenience wrapper around ggml_cont_4d: the two trailing dimensions
// are fixed to 1.
GGML_API struct ggml_tensor * ggml_cont_2d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int64_t               ne0,
        int64_t               ne1) {
    const int64_t ne2 = 1;
    const int64_t ne3 = 1;

    return ggml_cont_4d(ctx, a, ne0, ne1, ne2, ne3);
}
xuxzh1's avatar
init  
xuxzh1 committed
2978

xuxzh1's avatar
update  
xuxzh1 committed
2979
2980
2981
2982
2983
2984
2985
2986
// 3-D convenience wrapper around ggml_cont_4d: the last dimension is fixed
// to 1.
GGML_API struct ggml_tensor * ggml_cont_3d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int64_t               ne0,
        int64_t               ne1,
        int64_t               ne2) {
    const int64_t ne3 = 1;

    return ggml_cont_4d(ctx, a, ne0, ne1, ne2, ne3);
}
xuxzh1's avatar
init  
xuxzh1 committed
2987

xuxzh1's avatar
update  
xuxzh1 committed
2988
2989
2990
2991
2992
2993
2994
2995
struct ggml_tensor * ggml_cont_4d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int64_t               ne0,
        int64_t               ne1,
        int64_t               ne2,
        int64_t               ne3) {
    GGML_ASSERT(ggml_nelements(a) == (ne0*ne1*ne2*ne3));
xuxzh1's avatar
init  
xuxzh1 committed
2996

xuxzh1's avatar
update  
xuxzh1 committed
2997
2998
    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3);
    ggml_format_name(result, "%s (cont)", a->name);
xuxzh1's avatar
init  
xuxzh1 committed
2999

xuxzh1's avatar
update  
xuxzh1 committed
3000
    result->op     = GGML_OP_CONT;
xuxzh1's avatar
init  
xuxzh1 committed
3001
3002
3003
3004
3005
    result->src[0] = a;

    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
3006
// ggml_reshape
xuxzh1's avatar
init  
xuxzh1 committed
3007

xuxzh1's avatar
update  
xuxzh1 committed
3008
struct ggml_tensor * ggml_reshape(
xuxzh1's avatar
init  
xuxzh1 committed
3009
3010
3011
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b) {
xuxzh1's avatar
update  
xuxzh1 committed
3012
3013
3014
    GGML_ASSERT(ggml_is_contiguous(a));
    // as only the shape of b is relevant, and not its memory layout, b is allowed to be non contiguous.
    GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b));
xuxzh1's avatar
init  
xuxzh1 committed
3015

xuxzh1's avatar
update  
xuxzh1 committed
3016
3017
    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, GGML_MAX_DIMS, b->ne, a, 0);
    ggml_format_name(result, "%s (reshaped)", a->name);
xuxzh1's avatar
init  
xuxzh1 committed
3018

xuxzh1's avatar
update  
xuxzh1 committed
3019
3020
    result->op     = GGML_OP_RESHAPE;
    result->src[0] = a;
xuxzh1's avatar
init  
xuxzh1 committed
3021

xuxzh1's avatar
update  
xuxzh1 committed
3022
3023
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
3024

xuxzh1's avatar
update  
xuxzh1 committed
3025
3026
3027
3028
3029
3030
struct ggml_tensor * ggml_reshape_1d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int64_t               ne0) {
    GGML_ASSERT(ggml_is_contiguous(a));
    GGML_ASSERT(ggml_nelements(a) == ne0);
xuxzh1's avatar
init  
xuxzh1 committed
3031

xuxzh1's avatar
update  
xuxzh1 committed
3032
3033
3034
    const int64_t ne[1] = { ne0 };
    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a, 0);
    ggml_format_name(result, "%s (reshaped)", a->name);
xuxzh1's avatar
init  
xuxzh1 committed
3035

xuxzh1's avatar
update  
xuxzh1 committed
3036
    result->op     = GGML_OP_RESHAPE;
xuxzh1's avatar
init  
xuxzh1 committed
3037
3038
3039
3040
3041
    result->src[0] = a;

    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
3042
struct ggml_tensor * ggml_reshape_2d(
xuxzh1's avatar
init  
xuxzh1 committed
3043
3044
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
xuxzh1's avatar
update  
xuxzh1 committed
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
        int64_t               ne0,
        int64_t               ne1) {
    GGML_ASSERT(ggml_is_contiguous(a));
    GGML_ASSERT(ggml_nelements(a) == ne0*ne1);

    const int64_t ne[2] = { ne0, ne1 };
    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a, 0);
    ggml_format_name(result, "%s (reshaped)", a->name);

    result->op     = GGML_OP_RESHAPE;
    result->src[0] = a;

    return result;
xuxzh1's avatar
init  
xuxzh1 committed
3058
3059
}

xuxzh1's avatar
update  
xuxzh1 committed
3060
struct ggml_tensor * ggml_reshape_3d(
xuxzh1's avatar
init  
xuxzh1 committed
3061
3062
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
xuxzh1's avatar
update  
xuxzh1 committed
3063
3064
3065
3066
3067
        int64_t               ne0,
        int64_t               ne1,
        int64_t               ne2) {
    GGML_ASSERT(ggml_is_contiguous(a));
    GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2);
xuxzh1's avatar
init  
xuxzh1 committed
3068

xuxzh1's avatar
update  
xuxzh1 committed
3069
3070
3071
    const int64_t ne[3] = { ne0, ne1, ne2 };
    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a, 0);
    ggml_format_name(result, "%s (reshaped)", a->name);
xuxzh1's avatar
init  
xuxzh1 committed
3072

xuxzh1's avatar
update  
xuxzh1 committed
3073
3074
    result->op     = GGML_OP_RESHAPE;
    result->src[0] = a;
xuxzh1's avatar
init  
xuxzh1 committed
3075

xuxzh1's avatar
update  
xuxzh1 committed
3076
3077
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
3078

xuxzh1's avatar
update  
xuxzh1 committed
3079
3080
3081
3082
3083
3084
3085
3086
3087
struct ggml_tensor * ggml_reshape_4d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int64_t               ne0,
        int64_t               ne1,
        int64_t               ne2,
        int64_t               ne3) {
    GGML_ASSERT(ggml_is_contiguous(a));
    GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2*ne3);
xuxzh1's avatar
init  
xuxzh1 committed
3088

xuxzh1's avatar
update  
xuxzh1 committed
3089
3090
3091
    const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a, 0);
    ggml_format_name(result, "%s (reshaped)", a->name);
xuxzh1's avatar
init  
xuxzh1 committed
3092

xuxzh1's avatar
update  
xuxzh1 committed
3093
    result->op     = GGML_OP_RESHAPE;
xuxzh1's avatar
init  
xuxzh1 committed
3094
3095
3096
3097
3098
    result->src[0] = a;

    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
3099
static struct ggml_tensor * ggml_view_impl(
xuxzh1's avatar
init  
xuxzh1 committed
3100
3101
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
xuxzh1's avatar
update  
xuxzh1 committed
3102
3103
3104
3105
3106
        int                   n_dims,
        const int64_t       * ne,
        size_t                offset) {
    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, a, offset);
    ggml_format_name(result, "%s (view)", a->name);
xuxzh1's avatar
init  
xuxzh1 committed
3107

xuxzh1's avatar
update  
xuxzh1 committed
3108
    ggml_set_op_params(result, &offset, sizeof(offset));
xuxzh1's avatar
init  
xuxzh1 committed
3109

xuxzh1's avatar
update  
xuxzh1 committed
3110
    result->op     = GGML_OP_VIEW;
xuxzh1's avatar
init  
xuxzh1 committed
3111
3112
3113
3114
3115
    result->src[0] = a;

    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
3116
// ggml_view_1d
xuxzh1's avatar
init  
xuxzh1 committed
3117

xuxzh1's avatar
update  
xuxzh1 committed
3118
// 1-D view of a with ne0 elements at the given offset; delegates to
// ggml_view_impl and leaves the strides it returns untouched.
struct ggml_tensor * ggml_view_1d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int64_t               ne0,
        size_t                offset) {
    return ggml_view_impl(ctx, a, 1, &ne0, offset);
}

xuxzh1's avatar
update  
xuxzh1 committed
3128
// ggml_view_2d
xuxzh1's avatar
init  
xuxzh1 committed
3129

xuxzh1's avatar
update  
xuxzh1 committed
3130
struct ggml_tensor * ggml_view_2d(
xuxzh1's avatar
init  
xuxzh1 committed
3131
        struct ggml_context * ctx,
xuxzh1's avatar
update  
xuxzh1 committed
3132
3133
3134
3135
3136
3137
        struct ggml_tensor  * a,
        int64_t               ne0,
        int64_t               ne1,
        size_t                nb1,
        size_t                offset) {
    const int64_t ne[2] = { ne0, ne1 };
xuxzh1's avatar
init  
xuxzh1 committed
3138

xuxzh1's avatar
update  
xuxzh1 committed
3139
    struct ggml_tensor * result = ggml_view_impl(ctx, a, 2, ne, offset);
xuxzh1's avatar
init  
xuxzh1 committed
3140

xuxzh1's avatar
update  
xuxzh1 committed
3141
3142
3143
    result->nb[1] = nb1;
    result->nb[2] = result->nb[1]*ne1;
    result->nb[3] = result->nb[2];
xuxzh1's avatar
init  
xuxzh1 committed
3144
3145
3146
3147

    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
3148
// ggml_view_3d
xuxzh1's avatar
init  
xuxzh1 committed
3149

xuxzh1's avatar
update  
xuxzh1 committed
3150
struct ggml_tensor * ggml_view_3d(
xuxzh1's avatar
init  
xuxzh1 committed
3151
        struct ggml_context * ctx,
xuxzh1's avatar
update  
xuxzh1 committed
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
        struct ggml_tensor  * a,
        int64_t               ne0,
        int64_t               ne1,
        int64_t               ne2,
        size_t                nb1,
        size_t                nb2,
        size_t                offset) {
    const int64_t ne[3] = { ne0, ne1, ne2 };

    struct ggml_tensor * result = ggml_view_impl(ctx, a, 3, ne, offset);

    result->nb[1] = nb1;
    result->nb[2] = nb2;
    result->nb[3] = result->nb[2]*ne2;

    return result;
xuxzh1's avatar
init  
xuxzh1 committed
3168
3169
}

xuxzh1's avatar
update  
xuxzh1 committed
3170
// ggml_view_4d
xuxzh1's avatar
init  
xuxzh1 committed
3171

xuxzh1's avatar
update  
xuxzh1 committed
3172
struct ggml_tensor * ggml_view_4d(
xuxzh1's avatar
init  
xuxzh1 committed
3173
3174
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
xuxzh1's avatar
update  
xuxzh1 committed
3175
3176
3177
3178
3179
3180
3181
3182
3183
        int64_t               ne0,
        int64_t               ne1,
        int64_t               ne2,
        int64_t               ne3,
        size_t                nb1,
        size_t                nb2,
        size_t                nb3,
        size_t                offset) {
    const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
xuxzh1's avatar
init  
xuxzh1 committed
3184

xuxzh1's avatar
update  
xuxzh1 committed
3185
    struct ggml_tensor * result = ggml_view_impl(ctx, a, 4, ne, offset);
xuxzh1's avatar
init  
xuxzh1 committed
3186

xuxzh1's avatar
update  
xuxzh1 committed
3187
3188
3189
    result->nb[1] = nb1;
    result->nb[2] = nb2;
    result->nb[3] = nb3;
xuxzh1's avatar
init  
xuxzh1 committed
3190
3191
3192
3193

    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
3194
// ggml_permute
xuxzh1's avatar
init  
xuxzh1 committed
3195

xuxzh1's avatar
update  
xuxzh1 committed
3196
struct ggml_tensor * ggml_permute(
xuxzh1's avatar
init  
xuxzh1 committed
3197
        struct ggml_context * ctx,
xuxzh1's avatar
update  
xuxzh1 committed
3198
3199
3200
3201
3202
3203
3204
3205
3206
        struct ggml_tensor  * a,
        int                   axis0,
        int                   axis1,
        int                   axis2,
        int                   axis3) {
    GGML_ASSERT(axis0 >= 0 && axis0 < GGML_MAX_DIMS);
    GGML_ASSERT(axis1 >= 0 && axis1 < GGML_MAX_DIMS);
    GGML_ASSERT(axis2 >= 0 && axis2 < GGML_MAX_DIMS);
    GGML_ASSERT(axis3 >= 0 && axis3 < GGML_MAX_DIMS);
xuxzh1's avatar
init  
xuxzh1 committed
3207

xuxzh1's avatar
update  
xuxzh1 committed
3208
3209
3210
3211
3212
3213
    GGML_ASSERT(axis0 != axis1);
    GGML_ASSERT(axis0 != axis2);
    GGML_ASSERT(axis0 != axis3);
    GGML_ASSERT(axis1 != axis2);
    GGML_ASSERT(axis1 != axis3);
    GGML_ASSERT(axis2 != axis3);
xuxzh1's avatar
init  
xuxzh1 committed
3214

xuxzh1's avatar
update  
xuxzh1 committed
3215
3216
    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
    ggml_format_name(result, "%s (permuted)", a->name);
xuxzh1's avatar
init  
xuxzh1 committed
3217

xuxzh1's avatar
update  
xuxzh1 committed
3218
3219
    int ne[GGML_MAX_DIMS];
    int nb[GGML_MAX_DIMS];
xuxzh1's avatar
init  
xuxzh1 committed
3220

xuxzh1's avatar
update  
xuxzh1 committed
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
    ne[axis0] = a->ne[0];
    ne[axis1] = a->ne[1];
    ne[axis2] = a->ne[2];
    ne[axis3] = a->ne[3];

    nb[axis0] = a->nb[0];
    nb[axis1] = a->nb[1];
    nb[axis2] = a->nb[2];
    nb[axis3] = a->nb[3];

    result->ne[0] = ne[0];
    result->ne[1] = ne[1];
    result->ne[2] = ne[2];
    result->ne[3] = ne[3];

    result->nb[0] = nb[0];
    result->nb[1] = nb[1];
    result->nb[2] = nb[2];
    result->nb[3] = nb[3];
xuxzh1's avatar
init  
xuxzh1 committed
3240

xuxzh1's avatar
update  
xuxzh1 committed
3241
    result->op     = GGML_OP_PERMUTE;
xuxzh1's avatar
init  
xuxzh1 committed
3242
3243
    result->src[0] = a;

xuxzh1's avatar
update  
xuxzh1 committed
3244
3245
3246
    int32_t params[] = { axis0, axis1, axis2, axis3 };
    ggml_set_op_params(result, params, sizeof(params));

xuxzh1's avatar
init  
xuxzh1 committed
3247
3248
3249
    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
3250
// ggml_transpose
xuxzh1's avatar
init  
xuxzh1 committed
3251

xuxzh1's avatar
update  
xuxzh1 committed
3252
struct ggml_tensor * ggml_transpose(
xuxzh1's avatar
init  
xuxzh1 committed
3253
        struct ggml_context * ctx,
xuxzh1's avatar
update  
xuxzh1 committed
3254
3255
3256
        struct ggml_tensor  * a) {
    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
    ggml_format_name(result, "%s (transposed)", a->name);
xuxzh1's avatar
init  
xuxzh1 committed
3257

xuxzh1's avatar
update  
xuxzh1 committed
3258
3259
    result->ne[0] = a->ne[1];
    result->ne[1] = a->ne[0];
xuxzh1's avatar
init  
xuxzh1 committed
3260

xuxzh1's avatar
update  
xuxzh1 committed
3261
3262
    result->nb[0] = a->nb[1];
    result->nb[1] = a->nb[0];
xuxzh1's avatar
init  
xuxzh1 committed
3263

xuxzh1's avatar
update  
xuxzh1 committed
3264
    result->op     = GGML_OP_TRANSPOSE;
xuxzh1's avatar
init  
xuxzh1 committed
3265
3266
3267
3268
3269
    result->src[0] = a;

    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
3270
// ggml_get_rows
xuxzh1's avatar
init  
xuxzh1 committed
3271

xuxzh1's avatar
update  
xuxzh1 committed
3272
struct ggml_tensor * ggml_get_rows(
xuxzh1's avatar
init  
xuxzh1 committed
3273
        struct ggml_context * ctx,
xuxzh1's avatar
update  
xuxzh1 committed
3274
3275
3276
3277
3278
        struct ggml_tensor  * a,
        struct ggml_tensor  * b) {
    GGML_ASSERT(a->ne[2] == b->ne[1]);
    GGML_ASSERT(b->ne[3] == 1);
    GGML_ASSERT(b->type == GGML_TYPE_I32);
xuxzh1's avatar
init  
xuxzh1 committed
3279

xuxzh1's avatar
update  
xuxzh1 committed
3280
3281
3282
3283
    // TODO: implement non F32 return
    enum ggml_type type = GGML_TYPE_F32;
    if (a->type == GGML_TYPE_I32) {
        type = a->type;
xuxzh1's avatar
init  
xuxzh1 committed
3284
    }
xuxzh1's avatar
update  
xuxzh1 committed
3285
    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, type, a->ne[0], b->ne[0], b->ne[1], b->ne[2]);
xuxzh1's avatar
init  
xuxzh1 committed
3286

xuxzh1's avatar
update  
xuxzh1 committed
3287
    result->op     = GGML_OP_GET_ROWS;
xuxzh1's avatar
init  
xuxzh1 committed
3288
    result->src[0] = a;
xuxzh1's avatar
update  
xuxzh1 committed
3289
    result->src[1] = b;
xuxzh1's avatar
init  
xuxzh1 committed
3290
3291
3292
3293

    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
3294
// ggml_get_rows_back
xuxzh1's avatar
init  
xuxzh1 committed
3295

xuxzh1's avatar
update  
xuxzh1 committed
3296
struct ggml_tensor * ggml_get_rows_back(
xuxzh1's avatar
init  
xuxzh1 committed
3297
        struct ggml_context * ctx,
xuxzh1's avatar
update  
xuxzh1 committed
3298
3299
3300
3301
3302
        struct ggml_tensor  * a,
        struct ggml_tensor  * b,
        struct ggml_tensor  * c) {
    GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32);
    GGML_ASSERT(ggml_is_matrix(c) && (a->ne[0] == c->ne[0]));
xuxzh1's avatar
init  
xuxzh1 committed
3303

xuxzh1's avatar
update  
xuxzh1 committed
3304
3305
3306
    // TODO: implement non F32 return
    //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]);
    struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1]);
xuxzh1's avatar
init  
xuxzh1 committed
3307

xuxzh1's avatar
update  
xuxzh1 committed
3308
    result->op     = GGML_OP_GET_ROWS_BACK;
xuxzh1's avatar
init  
xuxzh1 committed
3309
    result->src[0] = a;
xuxzh1's avatar
update  
xuxzh1 committed
3310
    result->src[1] = b;
xuxzh1's avatar
init  
xuxzh1 committed
3311
3312
3313
3314

    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
3315
// ggml_diag
xuxzh1's avatar
init  
xuxzh1 committed
3316

xuxzh1's avatar
update  
xuxzh1 committed
3317
struct ggml_tensor * ggml_diag(
xuxzh1's avatar
init  
xuxzh1 committed
3318
        struct ggml_context * ctx,
xuxzh1's avatar
update  
xuxzh1 committed
3319
3320
        struct ggml_tensor  * a) {
    GGML_ASSERT(a->ne[1] == 1);
xuxzh1's avatar
init  
xuxzh1 committed
3321

xuxzh1's avatar
update  
xuxzh1 committed
3322
3323
    const int64_t ne[4] = { a->ne[0], a->ne[0], a->ne[2], a->ne[3] };
    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, 4, ne);
xuxzh1's avatar
init  
xuxzh1 committed
3324

xuxzh1's avatar
update  
xuxzh1 committed
3325
    result->op     = GGML_OP_DIAG;
xuxzh1's avatar
init  
xuxzh1 committed
3326
3327
3328
3329
3330
    result->src[0] = a;

    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
3331
// ggml_diag_mask_inf
xuxzh1's avatar
init  
xuxzh1 committed
3332

xuxzh1's avatar
update  
xuxzh1 committed
3333
static struct ggml_tensor * ggml_diag_mask_inf_impl(
xuxzh1's avatar
init  
xuxzh1 committed
3334
        struct ggml_context * ctx,
xuxzh1's avatar
update  
xuxzh1 committed
3335
3336
3337
3338
        struct ggml_tensor  * a,
        int                   n_past,
        bool                  inplace) {
    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
xuxzh1's avatar
init  
xuxzh1 committed
3339

xuxzh1's avatar
update  
xuxzh1 committed
3340
3341
    int32_t params[] = { n_past };
    ggml_set_op_params(result, params, sizeof(params));
xuxzh1's avatar
init  
xuxzh1 committed
3342

xuxzh1's avatar
update  
xuxzh1 committed
3343
    result->op     = GGML_OP_DIAG_MASK_INF;
xuxzh1's avatar
init  
xuxzh1 committed
3344
3345
3346
3347
3348
    result->src[0] = a;

    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
3349
3350
3351
3352
3353
3354
// Out-of-place DIAG_MASK_INF: forwards to ggml_diag_mask_inf_impl with the
// inplace flag cleared.
struct ggml_tensor * ggml_diag_mask_inf(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int                   n_past) {
    const bool inplace = false;

    return ggml_diag_mask_inf_impl(ctx, a, n_past, inplace);
}
xuxzh1's avatar
init  
xuxzh1 committed
3355

xuxzh1's avatar
update  
xuxzh1 committed
3356
3357
3358
3359
3360
3361
// In-place DIAG_MASK_INF: forwards to ggml_diag_mask_inf_impl with the
// inplace flag set, so the result is a view of a.
struct ggml_tensor * ggml_diag_mask_inf_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int                   n_past) {
    const bool inplace = true;

    return ggml_diag_mask_inf_impl(ctx, a, n_past, inplace);
}
xuxzh1's avatar
init  
xuxzh1 committed
3362

xuxzh1's avatar
update  
xuxzh1 committed
3363
// ggml_diag_mask_zero
xuxzh1's avatar
init  
xuxzh1 committed
3364

xuxzh1's avatar
update  
xuxzh1 committed
3365
3366
3367
3368
3369
3370
static struct ggml_tensor * ggml_diag_mask_zero_impl(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int                   n_past,
        bool                  inplace) {
    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
xuxzh1's avatar
init  
xuxzh1 committed
3371

xuxzh1's avatar
update  
xuxzh1 committed
3372
3373
    int32_t params[] = { n_past };
    ggml_set_op_params(result, params, sizeof(params));
xuxzh1's avatar
init  
xuxzh1 committed
3374

xuxzh1's avatar
update  
xuxzh1 committed
3375
    result->op     = GGML_OP_DIAG_MASK_ZERO;
xuxzh1's avatar
init  
xuxzh1 committed
3376
3377
3378
3379
3380
    result->src[0] = a;

    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
3381
// Out-of-place DIAG_MASK_ZERO: forwards to ggml_diag_mask_zero_impl with
// the inplace flag cleared.
struct ggml_tensor * ggml_diag_mask_zero(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int                   n_past) {
    const bool inplace = false;

    return ggml_diag_mask_zero_impl(ctx, a, n_past, inplace);
}

xuxzh1's avatar
update  
xuxzh1 committed
3388
// In-place DIAG_MASK_ZERO: forwards to ggml_diag_mask_zero_impl with the
// inplace flag set, so the result is a view of a.
struct ggml_tensor * ggml_diag_mask_zero_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int                   n_past) {
    const bool inplace = true;

    return ggml_diag_mask_zero_impl(ctx, a, n_past, inplace);
}

xuxzh1's avatar
update  
xuxzh1 committed
3395
// ggml_soft_max
xuxzh1's avatar
init  
xuxzh1 committed
3396

xuxzh1's avatar
update  
xuxzh1 committed
3397
static struct ggml_tensor * ggml_soft_max_impl(
xuxzh1's avatar
init  
xuxzh1 committed
3398
        struct ggml_context * ctx,
xuxzh1's avatar
update  
xuxzh1 committed
3399
3400
3401
3402
3403
3404
        struct ggml_tensor  * a,
        struct ggml_tensor  * mask,
        float                 scale,
        float                 max_bias,
        bool                  inplace) {
    GGML_ASSERT(ggml_is_contiguous(a));
xuxzh1's avatar
init  
xuxzh1 committed
3405

xuxzh1's avatar
update  
xuxzh1 committed
3406
3407
3408
3409
3410
3411
3412
    if (mask) {
        GGML_ASSERT(mask->type == GGML_TYPE_F16 || mask->type == GGML_TYPE_F32);
        GGML_ASSERT(ggml_is_contiguous(mask));
        GGML_ASSERT(ggml_is_matrix(mask));
        GGML_ASSERT(mask->ne[0] == a->ne[0]);
        GGML_ASSERT(mask->ne[1] >= a->ne[1]);
    }
xuxzh1's avatar
init  
xuxzh1 committed
3413

xuxzh1's avatar
update  
xuxzh1 committed
3414
3415
3416
    if (max_bias > 0.0f) {
        GGML_ASSERT(mask);
    }
xuxzh1's avatar
init  
xuxzh1 committed
3417

xuxzh1's avatar
update  
xuxzh1 committed
3418
    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
xuxzh1's avatar
init  
xuxzh1 committed
3419

xuxzh1's avatar
update  
xuxzh1 committed
3420
3421
    float params[] = { scale, max_bias };
    ggml_set_op_params(result, params, sizeof(params));
xuxzh1's avatar
init  
xuxzh1 committed
3422

xuxzh1's avatar
update  
xuxzh1 committed
3423
3424
3425
    result->op     = GGML_OP_SOFT_MAX;
    result->src[0] = a;
    result->src[1] = mask;
xuxzh1's avatar
init  
xuxzh1 committed
3426

xuxzh1's avatar
update  
xuxzh1 committed
3427
    return result;
xuxzh1's avatar
init  
xuxzh1 committed
3428
3429
}

xuxzh1's avatar
update  
xuxzh1 committed
3430
// Plain out-of-place softmax: no mask, scale 1.0, max_bias 0.0.
struct ggml_tensor * ggml_soft_max(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    const float scale    = 1.0f;
    const float max_bias = 0.0f;

    return ggml_soft_max_impl(ctx, a, NULL, scale, max_bias, /*inplace =*/ false);
}

xuxzh1's avatar
update  
xuxzh1 committed
3436
// Plain in-place softmax: no mask, scale 1.0, max_bias 0.0; the result is a
// view of a.
struct ggml_tensor * ggml_soft_max_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    const float scale    = 1.0f;
    const float max_bias = 0.0f;

    return ggml_soft_max_impl(ctx, a, NULL, scale, max_bias, /*inplace =*/ true);
}

xuxzh1's avatar
update  
xuxzh1 committed
3442
// Out-of-place softmax with caller-supplied mask, scale and max_bias;
// forwards to ggml_soft_max_impl with the inplace flag cleared.
struct ggml_tensor * ggml_soft_max_ext(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * mask,
        float                 scale,
        float                 max_bias) {
    const bool inplace = false;

    return ggml_soft_max_impl(ctx, a, mask, scale, max_bias, inplace);
}

xuxzh1's avatar
update  
xuxzh1 committed
3451
// ggml_soft_max_back
xuxzh1's avatar
init  
xuxzh1 committed
3452

xuxzh1's avatar
update  
xuxzh1 committed
3453
3454
3455
3456
3457
3458
static struct ggml_tensor * ggml_soft_max_back_impl(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b,
        bool                  inplace) {
    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
xuxzh1's avatar
init  
xuxzh1 committed
3459

xuxzh1's avatar
update  
xuxzh1 committed
3460
3461
3462
    result->op     = GGML_OP_SOFT_MAX_BACK;
    result->src[0] = a;
    result->src[1] = b;
xuxzh1's avatar
init  
xuxzh1 committed
3463

xuxzh1's avatar
update  
xuxzh1 committed
3464
3465
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
3466

xuxzh1's avatar
update  
xuxzh1 committed
3467
// Out-of-place SOFT_MAX_BACK: forwards to ggml_soft_max_back_impl with the
// inplace flag cleared.
struct ggml_tensor * ggml_soft_max_back(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b) {
    const bool inplace = false;

    return ggml_soft_max_back_impl(ctx, a, b, inplace);
}

xuxzh1's avatar
update  
xuxzh1 committed
3474
// In-place SOFT_MAX_BACK: forwards to ggml_soft_max_back_impl with the
// inplace flag set, so the result is a view of a.
struct ggml_tensor * ggml_soft_max_back_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b) {
    const bool inplace = true;

    return ggml_soft_max_back_impl(ctx, a, b, inplace);
}

xuxzh1's avatar
update  
xuxzh1 committed
3481
// ggml_rope
xuxzh1's avatar
init  
xuxzh1 committed
3482

xuxzh1's avatar
update  
xuxzh1 committed
3483
static struct ggml_tensor * ggml_rope_impl(
xuxzh1's avatar
init  
xuxzh1 committed
3484
        struct ggml_context * ctx,
xuxzh1's avatar
update  
xuxzh1 committed
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
        struct ggml_tensor  * a,
        struct ggml_tensor  * b,
        struct ggml_tensor  * c,
        int                   n_dims,
        int                   mode,
        int                   n_ctx_orig,
        float                 freq_base,
        float                 freq_scale,
        float                 ext_factor,
        float                 attn_factor,
        float                 beta_fast,
        float                 beta_slow,
        bool                  inplace) {
    GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");

    GGML_ASSERT(ggml_is_vector(b));
    GGML_ASSERT(b->type == GGML_TYPE_I32);
    GGML_ASSERT(a->ne[2] == b->ne[0]);
xuxzh1's avatar
init  
xuxzh1 committed
3503

xuxzh1's avatar
update  
xuxzh1 committed
3504
3505
3506
    if (c) {
        GGML_ASSERT(c->type == GGML_TYPE_F32);
        GGML_ASSERT(c->ne[0] >= n_dims / 2);
xuxzh1's avatar
init  
xuxzh1 committed
3507
3508
3509
3510
    }

    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

xuxzh1's avatar
update  
xuxzh1 committed
3511
3512
3513
3514
3515
3516
3517
3518
    int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
    memcpy(params +  5, &freq_base,    sizeof(float));
    memcpy(params +  6, &freq_scale,   sizeof(float));
    memcpy(params +  7, &ext_factor,   sizeof(float));
    memcpy(params +  8, &attn_factor,  sizeof(float));
    memcpy(params +  9, &beta_fast,    sizeof(float));
    memcpy(params + 10, &beta_slow,    sizeof(float));
    ggml_set_op_params(result, params, sizeof(params));
xuxzh1's avatar
init  
xuxzh1 committed
3519

xuxzh1's avatar
update  
xuxzh1 committed
3520
    result->op     = GGML_OP_ROPE;
xuxzh1's avatar
init  
xuxzh1 committed
3521
    result->src[0] = a;
xuxzh1's avatar
update  
xuxzh1 committed
3522
3523
    result->src[1] = b;
    result->src[2] = c;
xuxzh1's avatar
init  
xuxzh1 committed
3524
3525
3526
3527

    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
3528
// RoPE node with default settings, out-of-place.
//
// Forwards to ggml_rope_impl with c = NULL, n_ctx_orig = 0, freq_base = 10000,
// freq_scale = 1, ext_factor = 0, attn_factor = 1, beta_fast = 0, beta_slow = 0.
struct ggml_tensor * ggml_rope(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b,
        int                   n_dims,
        int                   mode) {
    const int   n_ctx_orig  = 0;
    const float freq_base   = 10000.0f;
    const float freq_scale  = 1.0f;
    const float ext_factor  = 0.0f;
    const float attn_factor = 1.0f;
    const float beta_fast   = 0.0f;
    const float beta_slow   = 0.0f;

    return ggml_rope_impl(
        ctx, a, b, NULL, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
        ext_factor, attn_factor, beta_fast, beta_slow, false
    );
}

xuxzh1's avatar
update  
xuxzh1 committed
3539
// RoPE node with default settings, in-place.
//
// Forwards to ggml_rope_impl with c = NULL, n_ctx_orig = 0, freq_base = 10000,
// freq_scale = 1, ext_factor = 0, attn_factor = 1, beta_fast = 0, beta_slow = 0.
struct ggml_tensor * ggml_rope_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b,
        int                   n_dims,
        int                   mode) {
    const int   n_ctx_orig  = 0;
    const float freq_base   = 10000.0f;
    const float freq_scale  = 1.0f;
    const float ext_factor  = 0.0f;
    const float attn_factor = 1.0f;
    const float beta_fast   = 0.0f;
    const float beta_slow   = 0.0f;

    return ggml_rope_impl(
        ctx, a, b, NULL, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
        ext_factor, attn_factor, beta_fast, beta_slow, true
    );
}

xuxzh1's avatar
update  
xuxzh1 committed
3550
// RoPE node with every setting supplied by the caller, out-of-place.
// `c` is passed through unchanged to ggml_rope_impl (which accepts NULL).
struct ggml_tensor * ggml_rope_ext(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b,
        struct ggml_tensor  * c,
        int                   n_dims,
        int                   mode,
        int                   n_ctx_orig,
        float                 freq_base,
        float                 freq_scale,
        float                 ext_factor,
        float                 attn_factor,
        float                 beta_fast,
        float                 beta_slow) {
    const bool inplace = false;

    return ggml_rope_impl(
        ctx, a, b, c,
        n_dims, mode, n_ctx_orig,
        freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow,
        inplace
    );
}

xuxzh1's avatar
update  
xuxzh1 committed
3570
// RoPE node with every setting supplied by the caller, in-place.
// `c` is passed through unchanged to ggml_rope_impl (which accepts NULL).
struct ggml_tensor * ggml_rope_ext_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b,
        struct ggml_tensor  * c,
        int                   n_dims,
        int                   mode,
        int                   n_ctx_orig,
        float                 freq_base,
        float                 freq_scale,
        float                 ext_factor,
        float                 attn_factor,
        float                 beta_fast,
        float                 beta_slow) {
    const bool inplace = true;

    return ggml_rope_impl(
        ctx, a, b, c,
        n_dims, mode, n_ctx_orig,
        freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow,
        inplace
    );
}

xuxzh1's avatar
update  
xuxzh1 committed
3590
// RoPE node with caller-supplied settings but no `c` tensor (NULL is passed
// to ggml_rope_impl), out-of-place.
struct ggml_tensor * ggml_rope_custom(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b,
        int                   n_dims,
        int                   mode,
        int                   n_ctx_orig,
        float                 freq_base,
        float                 freq_scale,
        float                 ext_factor,
        float                 attn_factor,
        float                 beta_fast,
        float                 beta_slow) {
    const bool inplace = false;

    return ggml_rope_impl(
        ctx, a, b, NULL,
        n_dims, mode, n_ctx_orig,
        freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow,
        inplace
    );
}

xuxzh1's avatar
update  
xuxzh1 committed
3609
// RoPE node with caller-supplied settings but no `c` tensor (NULL is passed
// to ggml_rope_impl), in-place.
struct ggml_tensor * ggml_rope_custom_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b,
        int                   n_dims,
        int                   mode,
        int                   n_ctx_orig,
        float                 freq_base,
        float                 freq_scale,
        float                 ext_factor,
        float                 attn_factor,
        float                 beta_fast,
        float                 beta_slow) {
    const bool inplace = true;

    return ggml_rope_impl(
        ctx, a, b, NULL,
        n_dims, mode, n_ctx_orig,
        freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow,
        inplace
    );
}

xuxzh1's avatar
update  
xuxzh1 committed
3628
3629
3630
3631
3632
// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
// `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
static float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
    return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
}
xuxzh1's avatar
init  
xuxzh1 committed
3633

xuxzh1's avatar
update  
xuxzh1 committed
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
// Computes the start and end correction dims and writes them to dims[0] and dims[1].
//
// dims[0] = MAX(0,          floor(corr_dim(beta_fast)))
// dims[1] = MIN(n_dims - 1, ceil (corr_dim(beta_slow)))
void ggml_rope_yarn_corr_dims(
    int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]
) {
    const float raw_lo = ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base);
    const float raw_hi = ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base);

    // round outwards so the [lo, hi] range is never narrowed
    const float lo = floorf(raw_lo);
    const float hi = ceilf(raw_hi);

    dims[0] = MAX(0, lo);
    dims[1] = MIN(n_dims - 1, hi);
}

// ggml_rope_back

struct ggml_tensor * ggml_rope_back(
xuxzh1's avatar
init  
xuxzh1 committed
3647
3648
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
xuxzh1's avatar
update  
xuxzh1 committed
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
        struct ggml_tensor  * b,
        struct ggml_tensor  * c,
        int                   n_dims,
        int                   mode,
        int                   n_ctx_orig,
        float                 freq_base,
        float                 freq_scale,
        float                 ext_factor,
        float                 attn_factor,
        float                 beta_fast,
        float                 beta_slow) {
    GGML_ASSERT(ggml_is_vector(b));
    GGML_ASSERT(b->type == GGML_TYPE_I32);
    GGML_ASSERT(a->ne[2] == b->ne[0]);
xuxzh1's avatar
init  
xuxzh1 committed
3663
3664
3665

    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);

xuxzh1's avatar
update  
xuxzh1 committed
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
    int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
    memcpy(params +  5, &freq_base,    sizeof(float));
    memcpy(params +  6, &freq_scale,   sizeof(float));
    memcpy(params +  7, &ext_factor,   sizeof(float));
    memcpy(params +  8, &attn_factor,  sizeof(float));
    memcpy(params +  9, &beta_fast,    sizeof(float));
    memcpy(params + 10, &beta_slow,    sizeof(float));
    ggml_set_op_params(result, params, sizeof(params));

    result->op     = GGML_OP_ROPE_BACK;
xuxzh1's avatar
init  
xuxzh1 committed
3676
3677
    result->src[0] = a;
    result->src[1] = b;
xuxzh1's avatar
update  
xuxzh1 committed
3678
    result->src[2] = c;
xuxzh1's avatar
init  
xuxzh1 committed
3679
3680
3681
3682

    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
3683
// ggml_clamp
xuxzh1's avatar
init  
xuxzh1 committed
3684

xuxzh1's avatar
update  
xuxzh1 committed
3685
struct ggml_tensor * ggml_clamp(
xuxzh1's avatar
init  
xuxzh1 committed
3686
3687
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
xuxzh1's avatar
update  
xuxzh1 committed
3688
3689
3690
3691
        float                 min,
        float                 max) {
    // TODO: when implement backward, fix this:
    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
xuxzh1's avatar
init  
xuxzh1 committed
3692

xuxzh1's avatar
update  
xuxzh1 committed
3693
3694
    float params[] = { min, max };
    ggml_set_op_params(result, params, sizeof(params));
xuxzh1's avatar
init  
xuxzh1 committed
3695

xuxzh1's avatar
update  
xuxzh1 committed
3696
    result->op     = GGML_OP_CLAMP;
xuxzh1's avatar
init  
xuxzh1 committed
3697
3698
3699
3700
3701
    result->src[0] = a;

    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
3702
// ggml_conv_1d
xuxzh1's avatar
init  
xuxzh1 committed
3703

xuxzh1's avatar
update  
xuxzh1 committed
3704
3705
// Output length of a convolution along one axis:
//   (ins + 2*p - d*(ks - 1) - 1) / s + 1
// ins = input size, ks = kernel size, s = stride, p = padding, d = dilation.
// Uses C integer division (truncates toward zero).
static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
    const int64_t padded_input  = ins + 2 * p;
    const int64_t kernel_extent = d * (ks - 1) + 1;

    return (padded_input - kernel_extent) / s + 1;
}

xuxzh1's avatar
update  
xuxzh1 committed
3708
GGML_API struct ggml_tensor * ggml_conv_1d(
xuxzh1's avatar
init  
xuxzh1 committed
3709
3710
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
xuxzh1's avatar
update  
xuxzh1 committed
3711
3712
3713
3714
3715
        struct ggml_tensor  * b,
        int                   s0,
        int                   p0,
        int                   d0) {
    struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16); // [N, OL, IC * K]
xuxzh1's avatar
init  
xuxzh1 committed
3716

xuxzh1's avatar
update  
xuxzh1 committed
3717
3718
3719
3720
    struct ggml_tensor * result =
        ggml_mul_mat(ctx,
                ggml_reshape_2d(ctx, im2col, im2col->ne[0], (im2col->ne[2] * im2col->ne[1])), // [N, OL, IC * K] => [N*OL, IC * K]
                ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1]), a->ne[2]));                    // [OC,IC, K] => [OC, IC * K]
xuxzh1's avatar
init  
xuxzh1 committed
3721

xuxzh1's avatar
update  
xuxzh1 committed
3722
    result = ggml_reshape_3d(ctx, result, im2col->ne[1], a->ne[2], im2col->ne[2]); // [N, OC, OL]
xuxzh1's avatar
init  
xuxzh1 committed
3723
3724
3725
3726

    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
3727
// ggml_conv_1d_ph
xuxzh1's avatar
init  
xuxzh1 committed
3728

xuxzh1's avatar
update  
xuxzh1 committed
3729
struct ggml_tensor* ggml_conv_1d_ph(
xuxzh1's avatar
init  
xuxzh1 committed
3730
3731
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
xuxzh1's avatar
update  
xuxzh1 committed
3732
3733
3734
3735
        struct ggml_tensor  * b,
        int                   s,
        int                   d) {
    return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
xuxzh1's avatar
init  
xuxzh1 committed
3736
3737
}

xuxzh1's avatar
update  
xuxzh1 committed
3738
// ggml_conv_transpose_1d
xuxzh1's avatar
init  
xuxzh1 committed
3739

xuxzh1's avatar
update  
xuxzh1 committed
3740
3741
3742
3743
3744
// Output length of a 1-D transposed convolution:
//   (ins - 1)*s - 2*p + d*(ks - 1) + 1
// ins = input size, ks = kernel size, s = stride, p = padding, d = dilation.
static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
    const int64_t strided_span  = (ins - 1) * s;
    const int64_t kernel_extent = d * (ks - 1) + 1;

    return strided_span - 2 * p + kernel_extent;
}

GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
xuxzh1's avatar
init  
xuxzh1 committed
3745
3746
3747
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b,
xuxzh1's avatar
update  
xuxzh1 committed
3748
3749
3750
3751
3752
3753
        int                   s0,
        int                   p0,
        int                   d0) {
    GGML_ASSERT(ggml_is_matrix(b));
    GGML_ASSERT(a->ne[2] == b->ne[1]);
    GGML_ASSERT(a->ne[3] == 1);
xuxzh1's avatar
init  
xuxzh1 committed
3754

xuxzh1's avatar
update  
xuxzh1 committed
3755
3756
    GGML_ASSERT(p0 == 0);
    GGML_ASSERT(d0 == 1);
xuxzh1's avatar
init  
xuxzh1 committed
3757

xuxzh1's avatar
update  
xuxzh1 committed
3758
3759
3760
3761
3762
    const int64_t ne[4] = {
        ggml_calc_conv_transpose_1d_output_size(b->ne[0], a->ne[0], s0, 0 /*p0*/, 1 /*d0*/),
        a->ne[1], b->ne[2], 1,
    };
    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
xuxzh1's avatar
init  
xuxzh1 committed
3763

xuxzh1's avatar
update  
xuxzh1 committed
3764
3765
    int32_t params[] = { s0, p0, d0 };
    ggml_set_op_params(result, params, sizeof(params));
xuxzh1's avatar
init  
xuxzh1 committed
3766

xuxzh1's avatar
update  
xuxzh1 committed
3767
    result->op     = GGML_OP_CONV_TRANSPOSE_1D;
xuxzh1's avatar
init  
xuxzh1 committed
3768
3769
3770
3771
3772
3773
    result->src[0] = a;
    result->src[1] = b;

    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
3774
// ggml_conv_depthwise
xuxzh1's avatar
init  
xuxzh1 committed
3775

xuxzh1's avatar
update  
xuxzh1 committed
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
// Depthwise 2-D convolution composed from reshape, im2col (F16, is_2D = true)
// and a matrix multiplication.
//
// a is the kernel, b the data; s*/p*/d* are stride, padding and dilation for
// dimension 0 and 1 respectively.
struct ggml_tensor * ggml_conv_depthwise_2d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b,
        int                   s0,
        int                   s1,
        int                   p0,
        int                   p1,
        int                   d0,
        int                   d1) {
    // fold dims 2 and 3 of both operands into dim 3, leaving dim 2 == 1
    struct ggml_tensor * kernel_4d = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
    struct ggml_tensor * input_4d  = ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]);

    // [N * IC, OH, OW, KH * KW]
    struct ggml_tensor * im2col = ggml_im2col(ctx, kernel_4d, input_4d,
                                              s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16);

    // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
    struct ggml_tensor * cols_4d = ggml_reshape_4d(ctx, im2col,
                                                   im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]);

    // [OC,1, KH, KW] => [1, OC, 1, KH * KW]
    struct ggml_tensor * kernel_flat = ggml_reshape_4d(ctx, kernel_4d,
                                                       (kernel_4d->ne[0] * kernel_4d->ne[1]), kernel_4d->ne[2], kernel_4d->ne[3], 1);

    struct ggml_tensor * product = ggml_mul_mat(ctx, kernel_flat, cols_4d);

    // [N, OC, OH, OW]
    return ggml_reshape_4d(ctx, product, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]);
}
xuxzh1's avatar
update  
xuxzh1 committed
3798
// ggml_conv_2d
xuxzh1's avatar
init  
xuxzh1 committed
3799

xuxzh1's avatar
update  
xuxzh1 committed
3800
3801
3802
3803
3804
// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
// a: [OC,IC, KH, KW]
// b: [N, IC, IH, IW]
// result: [N, OH, OW, IC*KH*KW]
struct ggml_tensor * ggml_im2col(
xuxzh1's avatar
init  
xuxzh1 committed
3805
3806
3807
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b,
xuxzh1's avatar
update  
xuxzh1 committed
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
        int                   s0,
        int                   s1,
        int                   p0,
        int                   p1,
        int                   d0,
        int                   d1,
        bool                  is_2D,
        enum ggml_type        dst_type) {
    if(is_2D) {
        GGML_ASSERT(a->ne[2] == b->ne[2]);
    } else {
        GGML_ASSERT(a->ne[1] == b->ne[1]);
        GGML_ASSERT(b->ne[3] == 1);
xuxzh1's avatar
init  
xuxzh1 committed
3821
3822
    }

xuxzh1's avatar
update  
xuxzh1 committed
3823
3824
    const int64_t OH = is_2D ? ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0;
    const int64_t OW =         ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
xuxzh1's avatar
init  
xuxzh1 committed
3825

xuxzh1's avatar
update  
xuxzh1 committed
3826
3827
    GGML_ASSERT((!is_2D || OH > 0) && "b too small compared to a");
    GGML_ASSERT((OW > 0)           && "b too small compared to a");
xuxzh1's avatar
init  
xuxzh1 committed
3828

xuxzh1's avatar
update  
xuxzh1 committed
3829
3830
3831
3832
3833
3834
    const int64_t ne[4] = {
        is_2D ? (a->ne[2] * a->ne[1] * a->ne[0]) : a->ne[1] * a->ne[0],
        OW,
        is_2D ? OH : b->ne[2],
        is_2D ?      b->ne[3] : 1,
    };
xuxzh1's avatar
init  
xuxzh1 committed
3835

xuxzh1's avatar
update  
xuxzh1 committed
3836
3837
3838
    struct ggml_tensor * result = ggml_new_tensor(ctx, dst_type, 4, ne);
    int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
    ggml_set_op_params(result, params, sizeof(params));
xuxzh1's avatar
init  
xuxzh1 committed
3839

xuxzh1's avatar
update  
xuxzh1 committed
3840
    result->op     = GGML_OP_IM2COL;
xuxzh1's avatar
init  
xuxzh1 committed
3841
3842
3843
3844
3845
3846
    result->src[0] = a;
    result->src[1] = b;

    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
3847
struct ggml_tensor * ggml_im2col_back(
xuxzh1's avatar
init  
xuxzh1 committed
3848
3849
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
xuxzh1's avatar
update  
xuxzh1 committed
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
        struct ggml_tensor  * b,
        int64_t             * ne,
        int                   s0,
        int                   s1,
        int                   p0,
        int                   p1,
        int                   d0,
        int                   d1,
        bool                  is_2D) {
    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
    int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
    ggml_set_op_params(result, params, sizeof(params));
xuxzh1's avatar
init  
xuxzh1 committed
3862

xuxzh1's avatar
update  
xuxzh1 committed
3863
    result->op     = GGML_OP_IM2COL_BACK;
xuxzh1's avatar
init  
xuxzh1 committed
3864
    result->src[0] = a;
xuxzh1's avatar
update  
xuxzh1 committed
3865
    result->src[1] = b;
xuxzh1's avatar
init  
xuxzh1 committed
3866
3867
3868
3869

    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
3870
3871
3872
3873
// a: [OC,IC, KH, KW]
// b: [N, IC, IH, IW]
// result: [N, OC, OH, OW]
struct ggml_tensor * ggml_conv_2d(
xuxzh1's avatar
init  
xuxzh1 committed
3874
3875
3876
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b,
xuxzh1's avatar
update  
xuxzh1 committed
3877
3878
3879
3880
3881
3882
3883
        int                   s0,
        int                   s1,
        int                   p0,
        int                   p1,
        int                   d0,
        int                   d1) {
    struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true, a->type); // [N, OH, OW, IC * KH * KW]
xuxzh1's avatar
init  
xuxzh1 committed
3884

xuxzh1's avatar
update  
xuxzh1 committed
3885
3886
3887
3888
    struct ggml_tensor * result =
        ggml_mul_mat(ctx,
                ggml_reshape_2d(ctx, im2col, im2col->ne[0],  im2col->ne[3] * im2col->ne[2] * im2col->ne[1]), // [N, OH, OW, IC * KH * KW] => [N*OH*OW, IC * KH * KW]
                ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2]),  a->ne[3]));                       // [OC,IC, KH, KW] => [OC, IC * KH * KW]
xuxzh1's avatar
init  
xuxzh1 committed
3889

xuxzh1's avatar
update  
xuxzh1 committed
3890
3891
    result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], im2col->ne[3], a->ne[3]); // [OC, N, OH, OW]
    result = ggml_cont(ctx, ggml_permute(ctx, result, 0, 1, 3, 2)); // [N, OC, OH, OW]
xuxzh1's avatar
init  
xuxzh1 committed
3892
3893
3894
3895
3896


    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
3897
// ggml_conv_2d_sk_p0
xuxzh1's avatar
init  
xuxzh1 committed
3898

xuxzh1's avatar
update  
xuxzh1 committed
3899
struct ggml_tensor * ggml_conv_2d_sk_p0(
xuxzh1's avatar
init  
xuxzh1 committed
3900
        struct ggml_context * ctx,
xuxzh1's avatar
update  
xuxzh1 committed
3901
3902
3903
        struct ggml_tensor  * a,
        struct ggml_tensor  * b) {
    return ggml_conv_2d(ctx, a, b, a->ne[0], a->ne[1], 0, 0, 1, 1);
xuxzh1's avatar
init  
xuxzh1 committed
3904
3905
}

xuxzh1's avatar
update  
xuxzh1 committed
3906
// ggml_conv_2d_s1_ph
xuxzh1's avatar
init  
xuxzh1 committed
3907

xuxzh1's avatar
update  
xuxzh1 committed
3908
struct ggml_tensor * ggml_conv_2d_s1_ph(
xuxzh1's avatar
init  
xuxzh1 committed
3909
        struct ggml_context * ctx,
xuxzh1's avatar
update  
xuxzh1 committed
3910
3911
3912
        struct ggml_tensor  * a,
        struct ggml_tensor  * b) {
    return ggml_conv_2d(ctx, a, b, 1, 1, a->ne[0] / 2, a->ne[1] / 2, 1, 1);
xuxzh1's avatar
init  
xuxzh1 committed
3913
3914
}

xuxzh1's avatar
update  
xuxzh1 committed
3915
// ggml_conv_transpose_2d_p0
xuxzh1's avatar
init  
xuxzh1 committed
3916

xuxzh1's avatar
update  
xuxzh1 committed
3917
3918
// Output length of a transposed convolution along one axis (no dilation):
//   (ins - 1)*s - 2*p + ks
// ins = input size, ks = kernel size, s = stride, p = padding.
static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {
    const int64_t strided_span = (ins - 1) * s;

    return strided_span - 2 * p + ks;
}

xuxzh1's avatar
update  
xuxzh1 committed
3921
struct ggml_tensor * ggml_conv_transpose_2d_p0(
xuxzh1's avatar
init  
xuxzh1 committed
3922
3923
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
xuxzh1's avatar
update  
xuxzh1 committed
3924
3925
3926
        struct ggml_tensor  * b,
        int                   stride) {
    GGML_ASSERT(a->ne[3] == b->ne[2]);
xuxzh1's avatar
init  
xuxzh1 committed
3927

xuxzh1's avatar
update  
xuxzh1 committed
3928
3929
3930
3931
3932
    const int64_t ne[4] = {
        ggml_calc_conv_transpose_output_size(b->ne[0], a->ne[0], stride, 0 /*p0*/),
        ggml_calc_conv_transpose_output_size(b->ne[1], a->ne[1], stride, 0 /*p1*/),
        a->ne[2], b->ne[3],
    };
xuxzh1's avatar
init  
xuxzh1 committed
3933

xuxzh1's avatar
update  
xuxzh1 committed
3934
    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
xuxzh1's avatar
init  
xuxzh1 committed
3935

xuxzh1's avatar
update  
xuxzh1 committed
3936
    ggml_set_op_params_i32(result, 0, stride);
xuxzh1's avatar
init  
xuxzh1 committed
3937

xuxzh1's avatar
update  
xuxzh1 committed
3938
    result->op     = GGML_OP_CONV_TRANSPOSE_2D;
xuxzh1's avatar
init  
xuxzh1 committed
3939
3940
3941
3942
3943
3944
    result->src[0] = a;
    result->src[1] = b;

    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
3945
// ggml_pool_*
xuxzh1's avatar
init  
xuxzh1 committed
3946

xuxzh1's avatar
update  
xuxzh1 committed
3947
3948
3949
// Output length of a pooling window along one axis:
//   (ins + 2*p - ks) / s + 1
// ins = input size, ks = kernel size, s = stride, p = padding.
// Because p is a float the whole expression is evaluated in single precision
// and only the final value is truncated to int64_t.
static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, float p) {
    const float padded_span = ins + 2 * p - ks;
    const float windows     = padded_span / s + 1;

    return windows;
}
xuxzh1's avatar
init  
xuxzh1 committed
3950

xuxzh1's avatar
update  
xuxzh1 committed
3951
// ggml_pool_1d
xuxzh1's avatar
init  
xuxzh1 committed
3952

xuxzh1's avatar
update  
xuxzh1 committed
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
struct ggml_tensor * ggml_pool_1d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        enum ggml_op_pool     op,
        int                   k0,
        int                   s0,
        int                   p0) {
    const int64_t ne[4] = {
        ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
        a->ne[1],
        a->ne[2],
        a->ne[3],
    };
    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
xuxzh1's avatar
init  
xuxzh1 committed
3967

xuxzh1's avatar
update  
xuxzh1 committed
3968
3969
    int32_t params[] = { op, k0, s0, p0 };
    ggml_set_op_params(result, params, sizeof(params));
xuxzh1's avatar
init  
xuxzh1 committed
3970

xuxzh1's avatar
update  
xuxzh1 committed
3971
3972
    result->op     = GGML_OP_POOL_1D;
    result->src[0] = a;
xuxzh1's avatar
init  
xuxzh1 committed
3973

xuxzh1's avatar
update  
xuxzh1 committed
3974
    return result;
xuxzh1's avatar
init  
xuxzh1 committed
3975
3976
}

xuxzh1's avatar
update  
xuxzh1 committed
3977
// ggml_pool_2d
xuxzh1's avatar
init  
xuxzh1 committed
3978

xuxzh1's avatar
update  
xuxzh1 committed
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
struct ggml_tensor * ggml_pool_2d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        enum ggml_op_pool     op,
        int                   k0,
        int                   k1,
        int                   s0,
        int                   s1,
        float                 p0,
        float                 p1) {
    struct ggml_tensor * result;
    const int64_t ne[4] = {
        ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
        ggml_calc_pool_output_size(a->ne[1], k1, s1, p1),
        a->ne[2],
        a->ne[3],
    };
    result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
xuxzh1's avatar
init  
xuxzh1 committed
3997

xuxzh1's avatar
update  
xuxzh1 committed
3998
3999
    int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
    ggml_set_op_params(result, params, sizeof(params));
xuxzh1's avatar
init  
xuxzh1 committed
4000

xuxzh1's avatar
update  
xuxzh1 committed
4001
4002
    result->op     = GGML_OP_POOL_2D;
    result->src[0] = a;
xuxzh1's avatar
init  
xuxzh1 committed
4003

xuxzh1's avatar
update  
xuxzh1 committed
4004
4005
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
4006

xuxzh1's avatar
update  
xuxzh1 committed
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
struct ggml_tensor * ggml_pool_2d_back(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * af,
        enum ggml_op_pool     op,
        int                   k0,
        int                   k1,
        int                   s0,
        int                   s1,
        float                 p0,
        float                 p1) {
    struct ggml_tensor * result;
    result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, af->ne);
xuxzh1's avatar
init  
xuxzh1 committed
4020

xuxzh1's avatar
update  
xuxzh1 committed
4021
4022
    int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
    ggml_set_op_params(result, params, sizeof(params));
xuxzh1's avatar
init  
xuxzh1 committed
4023

xuxzh1's avatar
update  
xuxzh1 committed
4024
4025
4026
    result->op     = GGML_OP_POOL_2D_BACK;
    result->src[0] = a;
    result->src[1] = af;
xuxzh1's avatar
init  
xuxzh1 committed
4027

xuxzh1's avatar
update  
xuxzh1 committed
4028
    return result;
xuxzh1's avatar
init  
xuxzh1 committed
4029
4030
}

xuxzh1's avatar
update  
xuxzh1 committed
4031
// ggml_upscale
xuxzh1's avatar
init  
xuxzh1 committed
4032

xuxzh1's avatar
update  
xuxzh1 committed
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
// Shared builder for GGML_OP_UPSCALE.
//
// Every target extent must be at least as large as the matching extent of `a`.
// The result is a new 4-D tensor of a's type with shape { ne0, ne1, ne2, ne3 }
// and `a` as src[0].
static struct ggml_tensor * ggml_upscale_impl(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int                   ne0,
        int                   ne1,
        int                   ne2,
        int                   ne3) {
    // upscaling never shrinks a dimension
    GGML_ASSERT(a->ne[0] <= ne0);
    GGML_ASSERT(a->ne[1] <= ne1);
    GGML_ASSERT(a->ne[2] <= ne2);
    GGML_ASSERT(a->ne[3] <= ne3);

    struct ggml_tensor * node = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3);

    node->src[0] = a;
    node->op     = GGML_OP_UPSCALE;

    return node;
}

xuxzh1's avatar
update  
xuxzh1 committed
4053
4054
4055
4056
4057
// Upscales dimensions 0 and 1 of `a` by `scale_factor`; dimensions 2 and 3
// keep their extents.
struct ggml_tensor * ggml_upscale(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int                   scale_factor) {
    const int ne0 = (int) (a->ne[0] * scale_factor);
    const int ne1 = (int) (a->ne[1] * scale_factor);
    const int ne2 = (int) a->ne[2];
    const int ne3 = (int) a->ne[3];

    return ggml_upscale_impl(ctx, a, ne0, ne1, ne2, ne3);
}

xuxzh1's avatar
update  
xuxzh1 committed
4060
4061
4062
4063
4064
4065
4066
4067
// Upscales `a` to the explicit target shape { ne0, ne1, ne2, ne3 }.
// Pure forwarder to ggml_upscale_impl, which validates the extents.
struct ggml_tensor * ggml_upscale_ext(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int                   ne0,
        int                   ne1,
        int                   ne2,
        int                   ne3) {
    struct ggml_tensor * upscaled = ggml_upscale_impl(ctx, a, ne0, ne1, ne2, ne3);

    return upscaled;
}

xuxzh1's avatar
update  
xuxzh1 committed
4070
// ggml_pad
xuxzh1's avatar
init  
xuxzh1 committed
4071

xuxzh1's avatar
update  
xuxzh1 committed
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
struct ggml_tensor * ggml_pad(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int                   p0,
        int                   p1,
        int                   p2,
        int                   p3) {
    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
            a->ne[0] + p0,
            a->ne[1] + p1,
            a->ne[2] + p2,
            a->ne[3] + p3);
xuxzh1's avatar
init  
xuxzh1 committed
4084

xuxzh1's avatar
update  
xuxzh1 committed
4085
4086
    result->op     = GGML_OP_PAD;
    result->src[0] = a;
xuxzh1's avatar
init  
xuxzh1 committed
4087

xuxzh1's avatar
update  
xuxzh1 committed
4088
4089
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
4090

xuxzh1's avatar
update  
xuxzh1 committed
4091
// ggml_arange
xuxzh1's avatar
init  
xuxzh1 committed
4092

xuxzh1's avatar
update  
xuxzh1 committed
4093
4094
4095
4096
4097
4098
struct ggml_tensor * ggml_arange(
        struct ggml_context * ctx,
        float                 start,
        float                 stop,
        float                 step) {
    GGML_ASSERT(stop > start);
xuxzh1's avatar
init  
xuxzh1 committed
4099

xuxzh1's avatar
update  
xuxzh1 committed
4100
    const int64_t steps = (int64_t) ceilf((stop - start) / step);
xuxzh1's avatar
init  
xuxzh1 committed
4101

xuxzh1's avatar
update  
xuxzh1 committed
4102
    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, steps);
xuxzh1's avatar
init  
xuxzh1 committed
4103

xuxzh1's avatar
update  
xuxzh1 committed
4104
4105
4106
    ggml_set_op_params_f32(result, 0, start);
    ggml_set_op_params_f32(result, 1, stop);
    ggml_set_op_params_f32(result, 2, step);
xuxzh1's avatar
init  
xuxzh1 committed
4107

xuxzh1's avatar
update  
xuxzh1 committed
4108
    result->op = GGML_OP_ARANGE;
xuxzh1's avatar
init  
xuxzh1 committed
4109

xuxzh1's avatar
update  
xuxzh1 committed
4110
    return result;
xuxzh1's avatar
init  
xuxzh1 committed
4111
4112
}

xuxzh1's avatar
update  
xuxzh1 committed
4113
// ggml_timestep_embedding
xuxzh1's avatar
init  
xuxzh1 committed
4114

xuxzh1's avatar
update  
xuxzh1 committed
4115
4116
4117
4118
4119
4120
4121
4122
struct ggml_tensor * ggml_timestep_embedding(
        struct ggml_context * ctx,
        struct ggml_tensor  * timesteps,
        int                   dim,
        int                   max_period) {
    int actual_dim = dim;
    if (dim % 2 != 0) {
        actual_dim = dim + 1;
xuxzh1's avatar
init  
xuxzh1 committed
4123
4124
    }

xuxzh1's avatar
update  
xuxzh1 committed
4125
    struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, actual_dim, timesteps->ne[0]);
xuxzh1's avatar
init  
xuxzh1 committed
4126

xuxzh1's avatar
update  
xuxzh1 committed
4127
4128
    ggml_set_op_params_i32(result, 0, dim);
    ggml_set_op_params_i32(result, 1, max_period);
xuxzh1's avatar
init  
xuxzh1 committed
4129

xuxzh1's avatar
update  
xuxzh1 committed
4130
4131
    result->op     = GGML_OP_TIMESTEP_EMBEDDING;
    result->src[0] = timesteps;
xuxzh1's avatar
init  
xuxzh1 committed
4132
4133
4134
4135

    return result;
}

xuxzh1's avatar
update  
xuxzh1 committed
4136
// ggml_argsort
xuxzh1's avatar
init  
xuxzh1 committed
4137

xuxzh1's avatar
update  
xuxzh1 committed
4138
4139
4140
4141
4142
4143
struct ggml_tensor * ggml_argsort(
        struct ggml_context  * ctx,
        struct ggml_tensor   * a,
        enum ggml_sort_order   order) {
    GGML_ASSERT(a->ne[0] <= INT32_MAX);
    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne);
xuxzh1's avatar
init  
xuxzh1 committed
4144

xuxzh1's avatar
update  
xuxzh1 committed
4145
    ggml_set_op_params_i32(result, 0, (int32_t) order);
xuxzh1's avatar
init  
xuxzh1 committed
4146

xuxzh1's avatar
update  
xuxzh1 committed
4147
4148
    result->op     = GGML_OP_ARGSORT;
    result->src[0] = a;
xuxzh1's avatar
init  
xuxzh1 committed
4149

xuxzh1's avatar
update  
xuxzh1 committed
4150
4151
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
4152

xuxzh1's avatar
update  
xuxzh1 committed
4153
// ggml_top_k
xuxzh1's avatar
init  
xuxzh1 committed
4154

xuxzh1's avatar
update  
xuxzh1 committed
4155
4156
4157
4158
4159
struct ggml_tensor * ggml_top_k(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int                   k) {
    GGML_ASSERT(a->ne[0] >= k);
xuxzh1's avatar
init  
xuxzh1 committed
4160

xuxzh1's avatar
update  
xuxzh1 committed
4161
    struct ggml_tensor * result = ggml_argsort(ctx, a, GGML_SORT_ORDER_DESC);
xuxzh1's avatar
init  
xuxzh1 committed
4162

xuxzh1's avatar
update  
xuxzh1 committed
4163
4164
4165
4166
    result = ggml_view_4d(ctx, result,
                k, result->ne[1], result->ne[2], result->ne[3],
                   result->nb[1], result->nb[2], result->nb[3],
                0);
xuxzh1's avatar
init  
xuxzh1 committed
4167

xuxzh1's avatar
update  
xuxzh1 committed
4168
4169
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
4170

xuxzh1's avatar
update  
xuxzh1 committed
4171
// ggml_flash_attn_ext
xuxzh1's avatar
init  
xuxzh1 committed
4172

xuxzh1's avatar
update  
xuxzh1 committed
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
struct ggml_tensor * ggml_flash_attn_ext(
        struct ggml_context * ctx,
        struct ggml_tensor  * q,
        struct ggml_tensor  * k,
        struct ggml_tensor  * v,
        struct ggml_tensor  * mask,
        float                 scale,
        float                 max_bias,
        float                 logit_softcap) {
    GGML_ASSERT(ggml_can_mul_mat(k, q));
    // TODO: check if vT can be multiplied by (k*qT)
xuxzh1's avatar
init  
xuxzh1 committed
4184

xuxzh1's avatar
update  
xuxzh1 committed
4185
4186
4187
4188
4189
4190
4191
4192
    if (mask) {
        GGML_ASSERT(ggml_is_contiguous(mask));
        GGML_ASSERT(mask->ne[2] == 1);
        GGML_ASSERT(mask->ne[3] == 1);
        GGML_ASSERT(mask->ne[1] >= GGML_PAD(q->ne[1], GGML_KQ_MASK_PAD) &&
                "the Flash-Attention kernel requires the mask to be padded to GGML_KQ_MASK_PAD and at least n_queries big");
        //GGML_ASSERT(ggml_can_repeat_rows(mask, qk));
    }
xuxzh1's avatar
init  
xuxzh1 committed
4193

xuxzh1's avatar
update  
xuxzh1 committed
4194
4195
4196
    if (max_bias > 0.0f) {
        GGML_ASSERT(mask);
    }
xuxzh1's avatar
init  
xuxzh1 committed
4197

xuxzh1's avatar
update  
xuxzh1 committed
4198
4199
4200
    // permute(0, 2, 1, 3)
    int64_t ne[4] = { q->ne[0], q->ne[2], q->ne[1], q->ne[3] };
    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
xuxzh1's avatar
init  
xuxzh1 committed
4201

xuxzh1's avatar
update  
xuxzh1 committed
4202
4203
    float params[] = { scale, max_bias, logit_softcap };
    ggml_set_op_params(result, params, sizeof(params));
xuxzh1's avatar
init  
xuxzh1 committed
4204

xuxzh1's avatar
update  
xuxzh1 committed
4205
4206
4207
4208
4209
    result->op     = GGML_OP_FLASH_ATTN_EXT;
    result->src[0] = q;
    result->src[1] = k;
    result->src[2] = v;
    result->src[3] = mask;
xuxzh1's avatar
init  
xuxzh1 committed
4210

xuxzh1's avatar
update  
xuxzh1 committed
4211
4212
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
4213

xuxzh1's avatar
update  
xuxzh1 committed
4214
4215
4216
4217
// Store the requested precision on an existing GGML_OP_FLASH_ATTN_EXT node.
// `a` must have been created with op == GGML_OP_FLASH_ATTN_EXT.
void ggml_flash_attn_ext_set_prec(
        struct ggml_tensor * a,
        enum ggml_prec       prec) {
    GGML_ASSERT(a->op == GGML_OP_FLASH_ATTN_EXT);

    const int32_t prec_i32 = (int32_t) prec;

    // ggml_flash_attn_ext() stores scale, max_bias and logit_softcap first,
    // so the precision goes into op param slot 3
    ggml_set_op_params_i32(a, 3, prec_i32);
}
xuxzh1's avatar
init  
xuxzh1 committed
4223

xuxzh1's avatar
update  
xuxzh1 committed
4224
4225
4226
enum ggml_prec ggml_flash_attn_ext_get_prec(
        const struct ggml_tensor * a) {
    GGML_ASSERT(a->op == GGML_OP_FLASH_ATTN_EXT);
xuxzh1's avatar
init  
xuxzh1 committed
4227

xuxzh1's avatar
update  
xuxzh1 committed
4228
    const int32_t prec_i32 = ggml_get_op_params_i32(a, 3);
xuxzh1's avatar
init  
xuxzh1 committed
4229

xuxzh1's avatar
update  
xuxzh1 committed
4230
    return (enum ggml_prec) prec_i32;
xuxzh1's avatar
init  
xuxzh1 committed
4231
4232
}

xuxzh1's avatar
update  
xuxzh1 committed
4233
// ggml_flash_attn_back
xuxzh1's avatar
init  
xuxzh1 committed
4234

xuxzh1's avatar
update  
xuxzh1 committed
4235
4236
4237
4238
4239
4240
4241
4242
struct ggml_tensor * ggml_flash_attn_back(
        struct ggml_context * ctx,
        struct ggml_tensor  * q,
        struct ggml_tensor  * k,
        struct ggml_tensor  * v,
        struct ggml_tensor  * d,
        bool                  masked) {
    GGML_ABORT("TODO: adapt to ggml_flash_attn_ext() changes");
xuxzh1's avatar
init  
xuxzh1 committed
4243

xuxzh1's avatar
update  
xuxzh1 committed
4244
4245
    GGML_ASSERT(ggml_can_mul_mat(k, q));
    // TODO: check if vT can be multiplied by (k*qT)
xuxzh1's avatar
init  
xuxzh1 committed
4246

xuxzh1's avatar
update  
xuxzh1 committed
4247
4248
4249
4250
    // d shape [D,N,ne2,ne3]
    // q shape [D,N,ne2,ne3]
    // k shape [D,M,kvne2,ne3]
    // v shape [M,D,kvne2,ne3]
xuxzh1's avatar
init  
xuxzh1 committed
4251

xuxzh1's avatar
update  
xuxzh1 committed
4252
4253
4254
4255
4256
4257
    const int64_t     D = q->ne[0];
    const int64_t     N = q->ne[1];
    const int64_t     M = k->ne[1];
    const int64_t   ne2 = q->ne[2];
    const int64_t   ne3 = q->ne[3];
    const int64_t kvne2 = k->ne[2];
xuxzh1's avatar
init  
xuxzh1 committed
4258

xuxzh1's avatar
update  
xuxzh1 committed
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
    GGML_ASSERT(k->ne[0] == D);
    GGML_ASSERT(v->ne[0] == M);
    GGML_ASSERT(v->ne[1] == D);
    GGML_ASSERT(d->ne[0] == D);
    GGML_ASSERT(d->ne[1] == N);
    GGML_ASSERT(k->ne[2] == kvne2);
    GGML_ASSERT(k->ne[3] == ne3);
    GGML_ASSERT(v->ne[2] == kvne2);
    GGML_ASSERT(v->ne[3] == ne3);
    GGML_ASSERT(d->ne[2] == ne2);
    GGML_ASSERT(d->ne[3] == ne3);
xuxzh1's avatar
init  
xuxzh1 committed
4270

xuxzh1's avatar
update  
xuxzh1 committed
4271
    GGML_ASSERT(ne2 % kvne2 == 0);
xuxzh1's avatar
init  
xuxzh1 committed
4272

xuxzh1's avatar
update  
xuxzh1 committed
4273
4274
4275
4276
4277
    // store gradients of q, k and v as continuous tensors concatenated in result.
    // note: v and gradv are actually transposed, i.e. v->ne[0] != D.
    const int64_t elem_q = ggml_nelements(q);
    const int64_t elem_k = ggml_nelements(k);
    const int64_t elem_v = ggml_nelements(v);
xuxzh1's avatar
init  
xuxzh1 committed
4278

xuxzh1's avatar
update  
xuxzh1 committed
4279
4280
4281
    enum ggml_type result_type = GGML_TYPE_F32;
    GGML_ASSERT(ggml_blck_size(result_type) == 1);
    const size_t tsize = ggml_type_size(result_type);
xuxzh1's avatar
init  
xuxzh1 committed
4282

xuxzh1's avatar
update  
xuxzh1 committed
4283
4284
4285
4286
    const size_t offs_q = 0;
    const size_t offs_k = offs_q + GGML_PAD(elem_q * tsize, GGML_MEM_ALIGN);
    const size_t offs_v = offs_k + GGML_PAD(elem_k * tsize, GGML_MEM_ALIGN);
    const size_t end    = offs_v + GGML_PAD(elem_v * tsize, GGML_MEM_ALIGN);
xuxzh1's avatar
init  
xuxzh1 committed
4287

xuxzh1's avatar
update  
xuxzh1 committed
4288
    const size_t nelements = (end + tsize - 1)/tsize;
xuxzh1's avatar
init  
xuxzh1 committed
4289

xuxzh1's avatar
update  
xuxzh1 committed
4290
    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nelements);
xuxzh1's avatar
init  
xuxzh1 committed
4291

xuxzh1's avatar
update  
xuxzh1 committed
4292
4293
    int32_t masked_i = masked ? 1 : 0;
    ggml_set_op_params(result, &masked_i, sizeof(masked_i));
xuxzh1's avatar
init  
xuxzh1 committed
4294

xuxzh1's avatar
update  
xuxzh1 committed
4295
4296
4297
4298
4299
    result->op     = GGML_OP_FLASH_ATTN_BACK;
    result->src[0] = q;
    result->src[1] = k;
    result->src[2] = v;
    result->src[3] = d;
xuxzh1's avatar
init  
xuxzh1 committed
4300

xuxzh1's avatar
update  
xuxzh1 committed
4301
4302
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
4303

xuxzh1's avatar
update  
xuxzh1 committed
4304
// ggml_ssm_conv
xuxzh1's avatar
init  
xuxzh1 committed
4305

xuxzh1's avatar
update  
xuxzh1 committed
4306
4307
4308
4309
4310
4311
struct ggml_tensor * ggml_ssm_conv(
        struct ggml_context * ctx,
        struct ggml_tensor  * sx,
        struct ggml_tensor  * c) {
    GGML_ASSERT(ggml_is_3d(sx));
    GGML_ASSERT(ggml_is_matrix(c));
xuxzh1's avatar
init  
xuxzh1 committed
4312

xuxzh1's avatar
update  
xuxzh1 committed
4313
4314
4315
4316
    const int64_t d_conv  = c->ne[0];
    const int64_t d_inner = c->ne[1];
    const int64_t n_t     = sx->ne[0] - d_conv + 1; // tokens per sequence
    const int64_t n_s     = sx->ne[2];
xuxzh1's avatar
init  
xuxzh1 committed
4317

xuxzh1's avatar
update  
xuxzh1 committed
4318
4319
4320
4321
4322
    // TODO: maybe support other strides than 1?
    // FIXME: this is always true?
    GGML_ASSERT(sx->ne[0] == d_conv - 1 + n_t);
    GGML_ASSERT(sx->ne[1] == d_inner);
    GGML_ASSERT(n_t >= 0);
xuxzh1's avatar
init  
xuxzh1 committed
4323

xuxzh1's avatar
update  
xuxzh1 committed
4324
    struct ggml_tensor * result = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_t, n_s);
xuxzh1's avatar
init  
xuxzh1 committed
4325

xuxzh1's avatar
update  
xuxzh1 committed
4326
4327
4328
    result->op     = GGML_OP_SSM_CONV;
    result->src[0] = sx;
    result->src[1] = c;
xuxzh1's avatar
init  
xuxzh1 committed
4329

xuxzh1's avatar
update  
xuxzh1 committed
4330
4331
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
4332

xuxzh1's avatar
update  
xuxzh1 committed
4333
// ggml_ssm_scan
xuxzh1's avatar
init  
xuxzh1 committed
4334

xuxzh1's avatar
update  
xuxzh1 committed
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
struct ggml_tensor * ggml_ssm_scan(
        struct ggml_context * ctx,
        struct ggml_tensor  * s,
        struct ggml_tensor  * x,
        struct ggml_tensor  * dt,
        struct ggml_tensor  * A,
        struct ggml_tensor  * B,
        struct ggml_tensor  * C) {
    GGML_ASSERT(ggml_is_contiguous(s));
    GGML_ASSERT(ggml_is_contiguous(x));
    GGML_ASSERT(ggml_is_contiguous(dt));
    GGML_ASSERT(ggml_is_contiguous(A));
    GGML_ASSERT(ggml_is_matrix(A));
    GGML_ASSERT(ggml_is_3d(B));
    GGML_ASSERT(ggml_is_3d(s));
    GGML_ASSERT(B->nb[0] == ggml_type_size(B->type));
    GGML_ASSERT(C->nb[0] == ggml_type_size(C->type));
    GGML_ASSERT(ggml_are_same_shape(x, dt));
    GGML_ASSERT(ggml_are_same_shape(B, C));
xuxzh1's avatar
init  
xuxzh1 committed
4354

xuxzh1's avatar
update  
xuxzh1 committed
4355
4356
4357
4358
4359
    {
        const int64_t d_state      = s->ne[0];
        const int64_t d_inner      = s->ne[1];
        const int64_t n_seq_tokens = x->ne[1];
        const int64_t n_seqs       = x->ne[2];
xuxzh1's avatar
init  
xuxzh1 committed
4360

xuxzh1's avatar
update  
xuxzh1 committed
4361
4362
4363
4364
4365
4366
4367
        GGML_ASSERT(s->ne[2] == n_seqs);
        GGML_ASSERT(x->ne[0] == d_inner);
        GGML_ASSERT(A->ne[0] == d_state);
        GGML_ASSERT(A->ne[1] == d_inner);
        GGML_ASSERT(B->ne[0] == d_state);
        GGML_ASSERT(B->ne[1] == n_seq_tokens);
        GGML_ASSERT(B->ne[2] == n_seqs);
xuxzh1's avatar
init  
xuxzh1 committed
4368
4369
    }

xuxzh1's avatar
update  
xuxzh1 committed
4370
4371
    // concatenated y + ssm_states
    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ggml_nelements(x) + ggml_nelements(s));
xuxzh1's avatar
init  
xuxzh1 committed
4372

xuxzh1's avatar
update  
xuxzh1 committed
4373
4374
4375
4376
4377
4378
4379
    result->op   = GGML_OP_SSM_SCAN;
    result->src[0] = s;
    result->src[1] = x;
    result->src[2] = dt;
    result->src[3] = A;
    result->src[4] = B;
    result->src[5] = C;
xuxzh1's avatar
init  
xuxzh1 committed
4380

xuxzh1's avatar
update  
xuxzh1 committed
4381
    return result;
xuxzh1's avatar
init  
xuxzh1 committed
4382
4383
}

xuxzh1's avatar
update  
xuxzh1 committed
4384
// ggml_win_part
xuxzh1's avatar
init  
xuxzh1 committed
4385

xuxzh1's avatar
update  
xuxzh1 committed
4386
4387
4388
4389
4390
4391
struct ggml_tensor * ggml_win_part(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int                   w) {
    GGML_ASSERT(a->ne[3] == 1);
    GGML_ASSERT(a->type  == GGML_TYPE_F32);
xuxzh1's avatar
init  
xuxzh1 committed
4392

xuxzh1's avatar
update  
xuxzh1 committed
4393
4394
4395
    // padding
    const int px = (w - a->ne[1]%w)%w;
    const int py = (w - a->ne[2]%w)%w;
xuxzh1's avatar
init  
xuxzh1 committed
4396

xuxzh1's avatar
update  
xuxzh1 committed
4397
4398
4399
    const int npx = (px + a->ne[1])/w;
    const int npy = (py + a->ne[2])/w;
    const int np  = npx*npy;
xuxzh1's avatar
init  
xuxzh1 committed
4400

xuxzh1's avatar
update  
xuxzh1 committed
4401
4402
    const int64_t ne[4] = { a->ne[0], w, w, np, };
    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
xuxzh1's avatar
init  
xuxzh1 committed
4403

xuxzh1's avatar
update  
xuxzh1 committed
4404
4405
    int32_t params[] = { npx, npy, w };
    ggml_set_op_params(result, params, sizeof(params));
xuxzh1's avatar
init  
xuxzh1 committed
4406

xuxzh1's avatar
update  
xuxzh1 committed
4407
4408
    result->op     = GGML_OP_WIN_PART;
    result->src[0] = a;
xuxzh1's avatar
init  
xuxzh1 committed
4409

xuxzh1's avatar
update  
xuxzh1 committed
4410
    return result;
xuxzh1's avatar
init  
xuxzh1 committed
4411
4412
}

xuxzh1's avatar
update  
xuxzh1 committed
4413
// ggml_win_unpart
xuxzh1's avatar
init  
xuxzh1 committed
4414

xuxzh1's avatar
update  
xuxzh1 committed
4415
4416
4417
4418
4419
4420
4421
struct ggml_tensor * ggml_win_unpart(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int                   w0,
        int                   h0,
        int                   w) {
    GGML_ASSERT(a->type == GGML_TYPE_F32);
xuxzh1's avatar
init  
xuxzh1 committed
4422

xuxzh1's avatar
update  
xuxzh1 committed
4423
4424
    const int64_t ne[4] = { a->ne[0], w0, h0, 1, };
    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
xuxzh1's avatar
init  
xuxzh1 committed
4425

xuxzh1's avatar
update  
xuxzh1 committed
4426
4427
    int32_t params[] = { w };
    ggml_set_op_params(result, params, sizeof(params));
xuxzh1's avatar
init  
xuxzh1 committed
4428

xuxzh1's avatar
update  
xuxzh1 committed
4429
4430
    result->op     = GGML_OP_WIN_UNPART;
    result->src[0] = a;
xuxzh1's avatar
init  
xuxzh1 committed
4431

xuxzh1's avatar
update  
xuxzh1 committed
4432
4433
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
4434

xuxzh1's avatar
update  
xuxzh1 committed
4435
// ggml_get_rel_pos
xuxzh1's avatar
init  
xuxzh1 committed
4436

xuxzh1's avatar
update  
xuxzh1 committed
4437
4438
4439
4440
4441
4442
4443
struct ggml_tensor * ggml_get_rel_pos(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int                   qh,
        int                   kh) {
    GGML_ASSERT(qh == kh);
    GGML_ASSERT(2*MAX(qh, kh) - 1 == a->ne[1]);
xuxzh1's avatar
init  
xuxzh1 committed
4444

xuxzh1's avatar
update  
xuxzh1 committed
4445
4446
    const int64_t ne[4] = { a->ne[0], kh, qh, 1, };
    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 3, ne);
xuxzh1's avatar
init  
xuxzh1 committed
4447

xuxzh1's avatar
update  
xuxzh1 committed
4448
4449
    result->op     = GGML_OP_GET_REL_POS;
    result->src[0] = a;
xuxzh1's avatar
init  
xuxzh1 committed
4450

xuxzh1's avatar
update  
xuxzh1 committed
4451
    return result;
xuxzh1's avatar
init  
xuxzh1 committed
4452
4453
}

xuxzh1's avatar
update  
xuxzh1 committed
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
// ggml_add_rel_pos

static struct ggml_tensor * ggml_add_rel_pos_impl(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * pw,
        struct ggml_tensor  * ph,
        bool                  inplace) {
    GGML_ASSERT(ggml_are_same_shape(pw, ph));
    GGML_ASSERT(ggml_is_contiguous(a));
    GGML_ASSERT(ggml_is_contiguous(pw));
    GGML_ASSERT(ggml_is_contiguous(ph));
    GGML_ASSERT(ph->type == GGML_TYPE_F32);
    GGML_ASSERT(pw->type == GGML_TYPE_F32);
    GGML_ASSERT(pw->ne[3] == a->ne[2]);
    GGML_ASSERT(pw->ne[0]*pw->ne[0] == a->ne[0]);
    GGML_ASSERT(pw->ne[1]*pw->ne[2] == a->ne[1]);
xuxzh1's avatar
init  
xuxzh1 committed
4471

xuxzh1's avatar
update  
xuxzh1 committed
4472
4473
    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
    ggml_set_op_params_i32(result, 0, inplace ? 1 : 0);
xuxzh1's avatar
init  
xuxzh1 committed
4474

xuxzh1's avatar
update  
xuxzh1 committed
4475
4476
4477
4478
    result->op     = GGML_OP_ADD_REL_POS;
    result->src[0] = a;
    result->src[1] = pw;
    result->src[2] = ph;
xuxzh1's avatar
init  
xuxzh1 committed
4479

xuxzh1's avatar
update  
xuxzh1 committed
4480
    return result;
xuxzh1's avatar
init  
xuxzh1 committed
4481
4482
}

xuxzh1's avatar
update  
xuxzh1 committed
4483
4484
4485
4486
4487
4488
4489
// Out-of-place GGML_OP_ADD_REL_POS: forwards to ggml_add_rel_pos_impl() with
// the inplace flag cleared.
struct ggml_tensor * ggml_add_rel_pos(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * pw,
        struct ggml_tensor  * ph) {
    const bool inplace = false;

    return ggml_add_rel_pos_impl(ctx, a, pw, ph, inplace);
}
xuxzh1's avatar
init  
xuxzh1 committed
4490

xuxzh1's avatar
update  
xuxzh1 committed
4491
4492
4493
4494
4495
4496
4497
// In-place GGML_OP_ADD_REL_POS: forwards to ggml_add_rel_pos_impl() with the
// inplace flag set.
struct ggml_tensor * ggml_add_rel_pos_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * pw,
        struct ggml_tensor  * ph) {
    const bool inplace = true;

    return ggml_add_rel_pos_impl(ctx, a, pw, ph, inplace);
}
xuxzh1's avatar
init  
xuxzh1 committed
4498

xuxzh1's avatar
update  
xuxzh1 committed
4499
// ggml_rwkv_wkv6
xuxzh1's avatar
init  
xuxzh1 committed
4500

xuxzh1's avatar
update  
xuxzh1 committed
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
struct ggml_tensor * ggml_rwkv_wkv6(
        struct ggml_context * ctx,
        struct ggml_tensor  * k,
        struct ggml_tensor  * v,
        struct ggml_tensor  * r,
        struct ggml_tensor  * tf,
        struct ggml_tensor  * td,
        struct ggml_tensor  * state) {
    GGML_ASSERT(ggml_is_contiguous(k));
    GGML_ASSERT(ggml_is_contiguous(v));
    GGML_ASSERT(ggml_is_contiguous(r));
    GGML_ASSERT(ggml_is_contiguous(tf));
    GGML_ASSERT(ggml_is_contiguous(td));
    GGML_ASSERT(ggml_is_contiguous(state));

    const int64_t S = k->ne[0];
    const int64_t H = k->ne[2];
    const int64_t n_tokens = k->ne[3];
    const int64_t n_seqs = state->ne[1];
    {
        GGML_ASSERT(k->ne[1] == 1);
        GGML_ASSERT(v->ne[0] == 1 && v->ne[1] == S && v->ne[2] == H && v->ne[3] == n_tokens);
        GGML_ASSERT(r->ne[0] == 1 && r->ne[1] == S && r->ne[2] == H && r->ne[3] == n_tokens);
        // TODO: RWKV v4 and v5
        GGML_ASSERT(td->ne[0] == 1 && td->ne[1] == S && td->ne[2] == H && td->ne[3] == n_tokens);
        GGML_ASSERT(ggml_nelements(state) == S * S * H * n_seqs);
xuxzh1's avatar
init  
xuxzh1 committed
4527
4528
    }

xuxzh1's avatar
update  
xuxzh1 committed
4529
4530
4531
    // concat output and new_state
    const int64_t ne[4] = { S * H, n_tokens + S * n_seqs, 1, 1 };
    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
xuxzh1's avatar
init  
xuxzh1 committed
4532

xuxzh1's avatar
update  
xuxzh1 committed
4533
4534
4535
4536
4537
4538
4539
    result->op     = GGML_OP_RWKV_WKV6;
    result->src[0] = k;
    result->src[1] = v;
    result->src[2] = r;
    result->src[3] = tf;
    result->src[4] = td;
    result->src[5] = state;
xuxzh1's avatar
init  
xuxzh1 committed
4540

xuxzh1's avatar
update  
xuxzh1 committed
4541
    return result;
xuxzh1's avatar
init  
xuxzh1 committed
4542
4543
}

xuxzh1's avatar
update  
xuxzh1 committed
4544
// ggml_unary
xuxzh1's avatar
init  
xuxzh1 committed
4545

xuxzh1's avatar
update  
xuxzh1 committed
4546
4547
4548
4549
4550
4551
static struct ggml_tensor * ggml_unary_impl(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        enum ggml_unary_op    op,
        bool                  inplace) {
    GGML_ASSERT(ggml_is_contiguous_1(a));
xuxzh1's avatar
init  
xuxzh1 committed
4552

xuxzh1's avatar
update  
xuxzh1 committed
4553
    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
xuxzh1's avatar
init  
xuxzh1 committed
4554

xuxzh1's avatar
update  
xuxzh1 committed
4555
    ggml_set_op_params_i32(result, 0, (int32_t) op);
xuxzh1's avatar
init  
xuxzh1 committed
4556

xuxzh1's avatar
update  
xuxzh1 committed
4557
4558
    result->op     = GGML_OP_UNARY;
    result->src[0] = a;
xuxzh1's avatar
init  
xuxzh1 committed
4559

xuxzh1's avatar
update  
xuxzh1 committed
4560
4561
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
4562

xuxzh1's avatar
update  
xuxzh1 committed
4563
4564
4565
4566
4567
4568
// Out-of-place unary op: forwards to ggml_unary_impl() with the inplace flag
// cleared.
struct ggml_tensor * ggml_unary(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        enum ggml_unary_op    op) {
    const bool inplace = false;

    return ggml_unary_impl(ctx, a, op, inplace);
}
xuxzh1's avatar
init  
xuxzh1 committed
4569

xuxzh1's avatar
update  
xuxzh1 committed
4570
4571
4572
4573
4574
4575
// In-place unary op: forwards to ggml_unary_impl() with the inplace flag set.
struct ggml_tensor * ggml_unary_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        enum ggml_unary_op    op) {
    const bool inplace = true;

    return ggml_unary_impl(ctx, a, op, inplace);
}
xuxzh1's avatar
init  
xuxzh1 committed
4576

xuxzh1's avatar
update  
xuxzh1 committed
4577
// ggml_map_unary
xuxzh1's avatar
init  
xuxzh1 committed
4578

xuxzh1's avatar
update  
xuxzh1 committed
4579
4580
4581
4582
4583
4584
static struct ggml_tensor * ggml_map_unary_impl_f32(
        struct ggml_context        * ctx,
        struct ggml_tensor         * a,
        const  ggml_unary_op_f32_t   fun,
        bool                         inplace) {
    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
xuxzh1's avatar
init  
xuxzh1 committed
4585

xuxzh1's avatar
update  
xuxzh1 committed
4586
    ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
xuxzh1's avatar
init  
xuxzh1 committed
4587

xuxzh1's avatar
update  
xuxzh1 committed
4588
4589
    result->op     = GGML_OP_MAP_UNARY;
    result->src[0] = a;
xuxzh1's avatar
init  
xuxzh1 committed
4590

xuxzh1's avatar
update  
xuxzh1 committed
4591
4592
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
4593

xuxzh1's avatar
update  
xuxzh1 committed
4594
4595
4596
4597
4598
4599
// Out-of-place GGML_OP_MAP_UNARY: forwards to ggml_map_unary_impl_f32() with
// the inplace flag cleared.
struct ggml_tensor * ggml_map_unary_f32(
        struct ggml_context        * ctx,
        struct ggml_tensor         * a,
        const  ggml_unary_op_f32_t   fun) {
    const bool inplace = false;

    return ggml_map_unary_impl_f32(ctx, a, fun, inplace);
}
xuxzh1's avatar
init  
xuxzh1 committed
4600

xuxzh1's avatar
update  
xuxzh1 committed
4601
4602
4603
4604
4605
4606
// In-place GGML_OP_MAP_UNARY: forwards to ggml_map_unary_impl_f32() with the
// inplace flag set.
struct ggml_tensor * ggml_map_unary_inplace_f32(
        struct ggml_context        * ctx,
        struct ggml_tensor         * a,
        const  ggml_unary_op_f32_t   fun) {
    const bool inplace = true;

    return ggml_map_unary_impl_f32(ctx, a, fun, inplace);
}
xuxzh1's avatar
init  
xuxzh1 committed
4607

xuxzh1's avatar
update  
xuxzh1 committed
4608
// ggml_map_binary
xuxzh1's avatar
init  
xuxzh1 committed
4609

xuxzh1's avatar
update  
xuxzh1 committed
4610
4611
4612
4613
4614
4615
4616
static struct ggml_tensor * ggml_map_binary_impl_f32(
        struct ggml_context         * ctx,
        struct ggml_tensor          * a,
        struct ggml_tensor          * b,
        const  ggml_binary_op_f32_t   fun,
        bool                          inplace) {
    GGML_ASSERT(ggml_are_same_shape(a, b));
xuxzh1's avatar
init  
xuxzh1 committed
4617

xuxzh1's avatar
update  
xuxzh1 committed
4618
    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
xuxzh1's avatar
init  
xuxzh1 committed
4619

xuxzh1's avatar
update  
xuxzh1 committed
4620
    ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
xuxzh1's avatar
init  
xuxzh1 committed
4621

xuxzh1's avatar
update  
xuxzh1 committed
4622
4623
4624
    result->op     = GGML_OP_MAP_BINARY;
    result->src[0] = a;
    result->src[1] = b;
xuxzh1's avatar
init  
xuxzh1 committed
4625

xuxzh1's avatar
update  
xuxzh1 committed
4626
4627
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
4628

xuxzh1's avatar
update  
xuxzh1 committed
4629
4630
4631
4632
4633
4634
4635
// Out-of-place GGML_OP_MAP_BINARY: forwards to ggml_map_binary_impl_f32() with
// the inplace flag cleared.
struct ggml_tensor * ggml_map_binary_f32(
        struct ggml_context         * ctx,
        struct ggml_tensor          * a,
        struct ggml_tensor          * b,
        const  ggml_binary_op_f32_t   fun) {
    const bool inplace = false;

    return ggml_map_binary_impl_f32(ctx, a, b, fun, inplace);
}
xuxzh1's avatar
init  
xuxzh1 committed
4636

xuxzh1's avatar
update  
xuxzh1 committed
4637
4638
4639
4640
4641
4642
4643
// In-place GGML_OP_MAP_BINARY: forwards to ggml_map_binary_impl_f32() with the
// inplace flag set.
struct ggml_tensor * ggml_map_binary_inplace_f32(
        struct ggml_context         * ctx,
        struct ggml_tensor          * a,
        struct ggml_tensor          * b,
        const  ggml_binary_op_f32_t   fun) {
    const bool inplace = true;

    return ggml_map_binary_impl_f32(ctx, a, b, fun, inplace);
}
xuxzh1's avatar
init  
xuxzh1 committed
4644

xuxzh1's avatar
update  
xuxzh1 committed
4645
// ggml_map_custom1_f32
xuxzh1's avatar
init  
xuxzh1 committed
4646

xuxzh1's avatar
update  
xuxzh1 committed
4647
4648
4649
4650
4651
4652
static struct ggml_tensor * ggml_map_custom1_impl_f32(
        struct ggml_context          * ctx,
        struct ggml_tensor           * a,
        const  ggml_custom1_op_f32_t   fun,
        bool                           inplace) {
    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
xuxzh1's avatar
init  
xuxzh1 committed
4653

xuxzh1's avatar
update  
xuxzh1 committed
4654
    ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
xuxzh1's avatar
init  
xuxzh1 committed
4655

xuxzh1's avatar
update  
xuxzh1 committed
4656
4657
    result->op     = GGML_OP_MAP_CUSTOM1_F32;
    result->src[0] = a;
xuxzh1's avatar
init  
xuxzh1 committed
4658

xuxzh1's avatar
update  
xuxzh1 committed
4659
4660
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
4661

xuxzh1's avatar
update  
xuxzh1 committed
4662
4663
4664
4665
4666
4667
// Out-of-place GGML_OP_MAP_CUSTOM1_F32: forwards to
// ggml_map_custom1_impl_f32() with the inplace flag cleared.
struct ggml_tensor * ggml_map_custom1_f32(
        struct ggml_context          * ctx,
        struct ggml_tensor           * a,
        const  ggml_custom1_op_f32_t   fun) {
    const bool inplace = false;

    return ggml_map_custom1_impl_f32(ctx, a, fun, inplace);
}
xuxzh1's avatar
init  
xuxzh1 committed
4668

xuxzh1's avatar
update  
xuxzh1 committed
4669
4670
4671
4672
4673
4674
// In-place GGML_OP_MAP_CUSTOM1_F32: forwards to ggml_map_custom1_impl_f32()
// with the inplace flag set.
struct ggml_tensor * ggml_map_custom1_inplace_f32(
        struct ggml_context          * ctx,
        struct ggml_tensor           * a,
        const  ggml_custom1_op_f32_t   fun) {
    const bool inplace = true;

    return ggml_map_custom1_impl_f32(ctx, a, fun, inplace);
}
xuxzh1's avatar
init  
xuxzh1 committed
4675

xuxzh1's avatar
update  
xuxzh1 committed
4676
// ggml_map_custom2_f32
xuxzh1's avatar
init  
xuxzh1 committed
4677

xuxzh1's avatar
update  
xuxzh1 committed
4678
4679
4680
4681
4682
4683
4684
static struct ggml_tensor * ggml_map_custom2_impl_f32(
        struct ggml_context          * ctx,
        struct ggml_tensor           * a,
        struct ggml_tensor           * b,
        const  ggml_custom2_op_f32_t   fun,
        bool                           inplace) {
    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
xuxzh1's avatar
init  
xuxzh1 committed
4685

xuxzh1's avatar
update  
xuxzh1 committed
4686
    ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
xuxzh1's avatar
init  
xuxzh1 committed
4687

xuxzh1's avatar
update  
xuxzh1 committed
4688
4689
4690
    result->op     = GGML_OP_MAP_CUSTOM2_F32;
    result->src[0] = a;
    result->src[1] = b;
xuxzh1's avatar
init  
xuxzh1 committed
4691

xuxzh1's avatar
update  
xuxzh1 committed
4692
    return result;
xuxzh1's avatar
init  
xuxzh1 committed
4693
4694
}

xuxzh1's avatar
update  
xuxzh1 committed
4695
4696
4697
4698
4699
4700
4701
// Out-of-place GGML_OP_MAP_CUSTOM2_F32: forwards to
// ggml_map_custom2_impl_f32() with the inplace flag cleared.
struct ggml_tensor * ggml_map_custom2_f32(
        struct ggml_context          * ctx,
        struct ggml_tensor           * a,
        struct ggml_tensor           * b,
        const  ggml_custom2_op_f32_t   fun) {
    const bool inplace = false;

    return ggml_map_custom2_impl_f32(ctx, a, b, fun, inplace);
}
xuxzh1's avatar
init  
xuxzh1 committed
4702

xuxzh1's avatar
update  
xuxzh1 committed
4703
4704
4705
4706
4707
4708
4709
// Public entry point: builds a GGML_OP_MAP_CUSTOM2_F32 node via
// ggml_map_custom2_impl_f32 with inplace = true (result is a view of a).
struct ggml_tensor * ggml_map_custom2_inplace_f32(
        struct ggml_context          * ctx,
        struct ggml_tensor           * a,
        struct ggml_tensor           * b,
        const  ggml_custom2_op_f32_t   fun) {
    const bool inplace = true;

    return ggml_map_custom2_impl_f32(ctx, a, b, fun, inplace);
}
xuxzh1's avatar
init  
xuxzh1 committed
4710

xuxzh1's avatar
update  
xuxzh1 committed
4711
// ggml_map_custom3_f32
xuxzh1's avatar
init  
xuxzh1 committed
4712

xuxzh1's avatar
update  
xuxzh1 committed
4713
4714
4715
4716
4717
4718
4719
4720
static struct ggml_tensor * ggml_map_custom3_impl_f32(
        struct ggml_context          * ctx,
        struct ggml_tensor           * a,
        struct ggml_tensor           * b,
        struct ggml_tensor           * c,
        const  ggml_custom3_op_f32_t   fun,
        bool                           inplace) {
    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
xuxzh1's avatar
init  
xuxzh1 committed
4721

xuxzh1's avatar
update  
xuxzh1 committed
4722
    ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
xuxzh1's avatar
init  
xuxzh1 committed
4723

xuxzh1's avatar
update  
xuxzh1 committed
4724
4725
4726
4727
    result->op     = GGML_OP_MAP_CUSTOM3_F32;
    result->src[0] = a;
    result->src[1] = b;
    result->src[2] = c;
xuxzh1's avatar
init  
xuxzh1 committed
4728

xuxzh1's avatar
update  
xuxzh1 committed
4729
4730
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
4731

xuxzh1's avatar
update  
xuxzh1 committed
4732
4733
4734
4735
4736
4737
4738
4739
// Public entry point: builds a GGML_OP_MAP_CUSTOM3_F32 node via
// ggml_map_custom3_impl_f32 with inplace = false (result duplicates a).
struct ggml_tensor * ggml_map_custom3_f32(
        struct ggml_context          * ctx,
        struct ggml_tensor           * a,
        struct ggml_tensor           * b,
        struct ggml_tensor           * c,
        const  ggml_custom3_op_f32_t   fun) {
    const bool inplace = false;

    return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, inplace);
}
xuxzh1's avatar
init  
xuxzh1 committed
4740

xuxzh1's avatar
update  
xuxzh1 committed
4741
4742
4743
4744
4745
4746
4747
4748
// Public entry point: builds a GGML_OP_MAP_CUSTOM3_F32 node via
// ggml_map_custom3_impl_f32 with inplace = true (result is a view of a).
struct ggml_tensor * ggml_map_custom3_inplace_f32(
        struct ggml_context          * ctx,
        struct ggml_tensor           * a,
        struct ggml_tensor           * b,
        struct ggml_tensor           * c,
        const  ggml_custom3_op_f32_t   fun) {
    const bool inplace = true;

    return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, inplace);
}
xuxzh1's avatar
init  
xuxzh1 committed
4749

xuxzh1's avatar
update  
xuxzh1 committed
4750
// ggml_map_custom1
xuxzh1's avatar
init  
xuxzh1 committed
4751

xuxzh1's avatar
update  
xuxzh1 committed
4752
4753
4754
4755
4756
4757
4758
4759
static struct ggml_tensor * ggml_map_custom1_impl(
        struct ggml_context      * ctx,
        struct ggml_tensor       * a,
        const  ggml_custom1_op_t   fun,
        int                        n_tasks,
        void                     * userdata,
        bool                       inplace) {
    GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
xuxzh1's avatar
init  
xuxzh1 committed
4760

xuxzh1's avatar
update  
xuxzh1 committed
4761
    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
xuxzh1's avatar
init  
xuxzh1 committed
4762

xuxzh1's avatar
update  
xuxzh1 committed
4763
4764
4765
4766
4767
4768
    struct ggml_map_custom1_op_params params = {
        /*.fun      =*/ fun,
        /*.n_tasks  =*/ n_tasks,
        /*.userdata =*/ userdata
    };
    ggml_set_op_params(result, (const void *) &params, sizeof(params));
xuxzh1's avatar
init  
xuxzh1 committed
4769

xuxzh1's avatar
update  
xuxzh1 committed
4770
4771
    result->op     = GGML_OP_MAP_CUSTOM1;
    result->src[0] = a;
xuxzh1's avatar
init  
xuxzh1 committed
4772

xuxzh1's avatar
update  
xuxzh1 committed
4773
4774
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
4775

xuxzh1's avatar
update  
xuxzh1 committed
4776
4777
4778
4779
4780
4781
4782
4783
// Public entry point: builds a GGML_OP_MAP_CUSTOM1 node via
// ggml_map_custom1_impl with inplace = false (result duplicates a).
struct ggml_tensor * ggml_map_custom1(
        struct ggml_context      * ctx,
        struct ggml_tensor       * a,
        const  ggml_custom1_op_t   fun,
        int                        n_tasks,
        void                     * userdata) {
    const bool inplace = false;

    return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, inplace);
}
xuxzh1's avatar
init  
xuxzh1 committed
4784

xuxzh1's avatar
update  
xuxzh1 committed
4785
4786
4787
4788
4789
4790
4791
4792
// Public entry point: builds a GGML_OP_MAP_CUSTOM1 node via
// ggml_map_custom1_impl with inplace = true (result is a view of a).
struct ggml_tensor * ggml_map_custom1_inplace(
        struct ggml_context      * ctx,
        struct ggml_tensor       * a,
        const  ggml_custom1_op_t   fun,
        int                        n_tasks,
        void                     * userdata) {
    const bool inplace = true;

    return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, inplace);
}
xuxzh1's avatar
init  
xuxzh1 committed
4793

xuxzh1's avatar
update  
xuxzh1 committed
4794
// ggml_map_custom2
xuxzh1's avatar
init  
xuxzh1 committed
4795

xuxzh1's avatar
update  
xuxzh1 committed
4796
4797
4798
4799
4800
4801
4802
4803
4804
static struct ggml_tensor * ggml_map_custom2_impl(
        struct ggml_context      * ctx,
        struct ggml_tensor       * a,
        struct ggml_tensor       * b,
        const  ggml_custom2_op_t   fun,
        int                        n_tasks,
        void                     * userdata,
        bool                       inplace) {
    GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
xuxzh1's avatar
init  
xuxzh1 committed
4805

xuxzh1's avatar
update  
xuxzh1 committed
4806
    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
xuxzh1's avatar
init  
xuxzh1 committed
4807

xuxzh1's avatar
update  
xuxzh1 committed
4808
4809
4810
4811
4812
4813
    struct ggml_map_custom2_op_params params = {
        /*.fun      =*/ fun,
        /*.n_tasks  =*/ n_tasks,
        /*.userdata =*/ userdata
    };
    ggml_set_op_params(result, (const void *) &params, sizeof(params));
xuxzh1's avatar
init  
xuxzh1 committed
4814

xuxzh1's avatar
update  
xuxzh1 committed
4815
4816
4817
    result->op     = GGML_OP_MAP_CUSTOM2;
    result->src[0] = a;
    result->src[1] = b;
xuxzh1's avatar
init  
xuxzh1 committed
4818

xuxzh1's avatar
update  
xuxzh1 committed
4819
4820
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
4821

xuxzh1's avatar
update  
xuxzh1 committed
4822
4823
4824
4825
4826
4827
4828
4829
4830
// Public entry point: builds a GGML_OP_MAP_CUSTOM2 node via
// ggml_map_custom2_impl with inplace = false (result duplicates a).
struct ggml_tensor * ggml_map_custom2(
        struct ggml_context      * ctx,
        struct ggml_tensor       * a,
        struct ggml_tensor       * b,
        const  ggml_custom2_op_t   fun,
        int                        n_tasks,
        void                     * userdata) {
    const bool inplace = false;

    return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, inplace);
}
xuxzh1's avatar
init  
xuxzh1 committed
4831

xuxzh1's avatar
update  
xuxzh1 committed
4832
4833
4834
4835
4836
4837
4838
4839
4840
// Public entry point: builds a GGML_OP_MAP_CUSTOM2 node via
// ggml_map_custom2_impl with inplace = true (result is a view of a).
struct ggml_tensor * ggml_map_custom2_inplace(
        struct ggml_context      * ctx,
        struct ggml_tensor       * a,
        struct ggml_tensor       * b,
        const  ggml_custom2_op_t   fun,
        int                        n_tasks,
        void                     * userdata) {
    const bool inplace = true;

    return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, inplace);
}
xuxzh1's avatar
init  
xuxzh1 committed
4841

xuxzh1's avatar
update  
xuxzh1 committed
4842
// ggml_map_custom3
xuxzh1's avatar
init  
xuxzh1 committed
4843

xuxzh1's avatar
update  
xuxzh1 committed
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
static struct ggml_tensor * ggml_map_custom3_impl(
        struct ggml_context      * ctx,
        struct ggml_tensor       * a,
        struct ggml_tensor       * b,
        struct ggml_tensor       * c,
        const  ggml_custom3_op_t   fun,
        int                        n_tasks,
        void                     * userdata,
        bool                       inplace) {
    GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
xuxzh1's avatar
init  
xuxzh1 committed
4854

xuxzh1's avatar
update  
xuxzh1 committed
4855
    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
xuxzh1's avatar
init  
xuxzh1 committed
4856

xuxzh1's avatar
update  
xuxzh1 committed
4857
4858
4859
4860
4861
4862
    struct ggml_map_custom3_op_params params = {
        /*.fun      =*/ fun,
        /*.n_tasks  =*/ n_tasks,
        /*.userdata =*/ userdata
    };
    ggml_set_op_params(result, (const void *) &params, sizeof(params));
xuxzh1's avatar
init  
xuxzh1 committed
4863

xuxzh1's avatar
update  
xuxzh1 committed
4864
4865
4866
4867
    result->op     = GGML_OP_MAP_CUSTOM3;
    result->src[0] = a;
    result->src[1] = b;
    result->src[2] = c;
xuxzh1's avatar
init  
xuxzh1 committed
4868

xuxzh1's avatar
update  
xuxzh1 committed
4869
4870
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
4871

xuxzh1's avatar
update  
xuxzh1 committed
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
// Public entry point: builds a GGML_OP_MAP_CUSTOM3 node via
// ggml_map_custom3_impl with inplace = false (result duplicates a).
struct ggml_tensor * ggml_map_custom3(
        struct ggml_context      * ctx,
        struct ggml_tensor       * a,
        struct ggml_tensor       * b,
        struct ggml_tensor       * c,
        const  ggml_custom3_op_t   fun,
        int                        n_tasks,
        void                     * userdata) {
    const bool inplace = false;

    return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, inplace);
}
xuxzh1's avatar
init  
xuxzh1 committed
4882

xuxzh1's avatar
update  
xuxzh1 committed
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
// Public entry point: builds a GGML_OP_MAP_CUSTOM3 node via
// ggml_map_custom3_impl with inplace = true (result is a view of a).
struct ggml_tensor * ggml_map_custom3_inplace(
        struct ggml_context      * ctx,
        struct ggml_tensor       * a,
        struct ggml_tensor       * b,
        struct ggml_tensor       * c,
        const  ggml_custom3_op_t   fun,
        int                        n_tasks,
        void                     * userdata) {
    const bool inplace = true;

    return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, inplace);
}
xuxzh1's avatar
init  
xuxzh1 committed
4893

xuxzh1's avatar
update  
xuxzh1 committed
4894
// ggml_cross_entropy_loss
xuxzh1's avatar
init  
xuxzh1 committed
4895

xuxzh1's avatar
update  
xuxzh1 committed
4896
4897
4898
4899
4900
struct ggml_tensor * ggml_cross_entropy_loss(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b) {
    GGML_ASSERT(ggml_are_same_shape(a, b));
xuxzh1's avatar
init  
xuxzh1 committed
4901

xuxzh1's avatar
update  
xuxzh1 committed
4902
    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, a->type, 1);
xuxzh1's avatar
init  
xuxzh1 committed
4903

xuxzh1's avatar
update  
xuxzh1 committed
4904
4905
4906
    result->op     = GGML_OP_CROSS_ENTROPY_LOSS;
    result->src[0] = a;
    result->src[1] = b;
xuxzh1's avatar
init  
xuxzh1 committed
4907

xuxzh1's avatar
update  
xuxzh1 committed
4908
4909
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
4910

xuxzh1's avatar
update  
xuxzh1 committed
4911
// ggml_cross_entropy_loss_back
xuxzh1's avatar
init  
xuxzh1 committed
4912

xuxzh1's avatar
update  
xuxzh1 committed
4913
4914
4915
4916
4917
4918
4919
struct ggml_tensor * ggml_cross_entropy_loss_back(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b,
        struct ggml_tensor  * c) {
    GGML_ASSERT(ggml_are_same_shape(a, b));
    GGML_ASSERT(ggml_is_scalar(c));
xuxzh1's avatar
init  
xuxzh1 committed
4920

xuxzh1's avatar
update  
xuxzh1 committed
4921
    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
xuxzh1's avatar
init  
xuxzh1 committed
4922

xuxzh1's avatar
update  
xuxzh1 committed
4923
4924
4925
4926
    result->op     = GGML_OP_CROSS_ENTROPY_LOSS_BACK;
    result->src[0] = a;
    result->src[1] = b;
    result->src[2] = c;
xuxzh1's avatar
init  
xuxzh1 committed
4927

xuxzh1's avatar
update  
xuxzh1 committed
4928
4929
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
4930

xuxzh1's avatar
update  
xuxzh1 committed
4931
// opt_step_adamw
xuxzh1's avatar
init  
xuxzh1 committed
4932

xuxzh1's avatar
update  
xuxzh1 committed
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
struct ggml_tensor * ggml_opt_step_adamw(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * grad,
        struct ggml_tensor  * m,
        struct ggml_tensor  * v,
        struct ggml_tensor  * adamw_params) {
    GGML_ASSERT(a->flags & GGML_TENSOR_FLAG_PARAM);
    GGML_ASSERT(ggml_are_same_shape(a, grad));
    GGML_ASSERT(ggml_are_same_shape(a, m));
    GGML_ASSERT(ggml_are_same_shape(a, v));
    GGML_ASSERT(adamw_params->type == GGML_TYPE_F32);
    GGML_ASSERT(ggml_nelements(adamw_params) == 7);
xuxzh1's avatar
init  
xuxzh1 committed
4946

xuxzh1's avatar
update  
xuxzh1 committed
4947
    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
xuxzh1's avatar
init  
xuxzh1 committed
4948

xuxzh1's avatar
update  
xuxzh1 committed
4949
4950
4951
4952
4953
4954
    result->op     = GGML_OP_OPT_STEP_ADAMW;
    result->src[0] = a;
    result->src[1] = grad;
    result->src[2] = m;
    result->src[3] = v;
    result->src[4] = adamw_params;
xuxzh1's avatar
init  
xuxzh1 committed
4955

xuxzh1's avatar
update  
xuxzh1 committed
4956
4957
    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
4958

xuxzh1's avatar
update  
xuxzh1 committed
4959
////////////////////////////////////////////////////////////////////////////////
xuxzh1's avatar
init  
xuxzh1 committed
4960

xuxzh1's avatar
update  
xuxzh1 committed
4961
4962
4963
4964
4965
4966
4967
4968
// Creates a hash set sized by ggml_hash_size(size).
// The key array is obtained with GGML_MALLOC and the `used` bitset with
// GGML_CALLOC; release both with ggml_hash_set_free.
struct ggml_hash_set ggml_hash_set_new(size_t size) {
    const size_t n_slots = ggml_hash_size(size);

    struct ggml_hash_set result;
    result.size = n_slots;
    result.keys = GGML_MALLOC(n_slots * sizeof(struct ggml_tensor *));
    result.used = GGML_CALLOC(ggml_bitset_size(n_slots), sizeof(ggml_bitset_t));

    return result;
}
xuxzh1's avatar
init  
xuxzh1 committed
4969

xuxzh1's avatar
update  
xuxzh1 committed
4970
4971
4972
// Zeroes the whole `used` bitset of the hash set; the key array is not touched.
void ggml_hash_set_reset(struct ggml_hash_set * hash_set) {
    const size_t n_bytes = ggml_bitset_size(hash_set->size) * sizeof(ggml_bitset_t);

    memset(hash_set->used, 0, n_bytes);
}
xuxzh1's avatar
init  
xuxzh1 committed
4973

xuxzh1's avatar
update  
xuxzh1 committed
4974
4975
4976
4977
// Releases the two buffers owned by the hash set (keys and used bitset).
// The struct itself is not freed and its pointers are left as they were.
void ggml_hash_set_free(struct ggml_hash_set * hash_set) {
    GGML_FREE(hash_set->keys);
    GGML_FREE(hash_set->used);
}
xuxzh1's avatar
init  
xuxzh1 committed
4978

xuxzh1's avatar
update  
xuxzh1 committed
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
// Returns the table size to use for at least min_sz entries: the smallest
// entry of the prime table below that is >= min_sz. If min_sz is larger than
// every table entry, min_sz with its lowest bit set (i.e. forced odd) is
// returned instead.
size_t ggml_hash_size(size_t min_sz) {
    // next primes after powers of two
    static const size_t primes[] = {
        2, 3, 5, 11, 17, 37, 67, 131, 257, 521, 1031,
        2053, 4099, 8209, 16411, 32771, 65537, 131101,
        262147, 524309, 1048583, 2097169, 4194319, 8388617,
        16777259, 33554467, 67108879, 134217757, 268435459,
        536870923, 1073741827, 2147483659
    };
    static const size_t n_primes = sizeof(primes)/sizeof(primes[0]);

    // binary search for the smallest prime that is larger or equal than min_sz
    // invariant: entries before lo are < min_sz, entries from hi on are >= min_sz
    size_t lo = 0;
    size_t hi = n_primes;
    while (lo < hi) {
        const size_t mid = lo + (hi - lo)/2;
        if (primes[mid] < min_sz) {
            lo = mid + 1;
        } else {
            hi = mid;
        }
    }

    if (lo == n_primes) {
        // min_sz is beyond the table: fall back to an odd size
        return min_sz | 1;
    }
    return primes[lo];
}
xuxzh1's avatar
init  
xuxzh1 committed
5004

xuxzh1's avatar
update  
xuxzh1 committed
5005
5006
5007
5008
// Tensor-to-tensor map built on top of ggml_hash_set.
struct hash_map {
    struct ggml_hash_set set;   // key set; ggml_new_hash_map creates it with ggml_hash_set_new
    struct ggml_tensor ** vals; // value array, allocated with set.size entries by ggml_new_hash_map
};
xuxzh1's avatar
init  
xuxzh1 committed
5009

xuxzh1's avatar
update  
xuxzh1 committed
5010
5011
5012
5013
5014
5015
// Allocates a hash_map: the key set comes from ggml_hash_set_new(size) and the
// value array gets one tensor-pointer slot per slot of that set (GGML_CALLOC).
// Release with ggml_hash_map_free.
static struct hash_map * ggml_new_hash_map(size_t size) {
    struct hash_map * map = GGML_MALLOC(sizeof(*map));

    map->set  = ggml_hash_set_new(size);
    map->vals = GGML_CALLOC(map->set.size, sizeof(map->vals[0]));

    return map;
}
xuxzh1's avatar
init  
xuxzh1 committed
5016

xuxzh1's avatar
update  
xuxzh1 committed
5017
5018
5019
5020
5021
// Frees everything owned by the map (value array, key set buffers) and then
// the map object itself.
static void ggml_hash_map_free(struct hash_map * map) {
    GGML_FREE(map->vals);
    ggml_hash_set_free(&map->set);

    // the container goes last, after its members have been released
    GGML_FREE(map);
}
xuxzh1's avatar
init  
xuxzh1 committed
5022

xuxzh1's avatar
update  
xuxzh1 committed
5023
5024
5025
5026
5027
5028
// utility functions to change gradients
// isrc is the index of tensor in cgraph->visited_hash_set.keys
// the corresponding gradient (accumulators) are also at position isrc
// if tensor has a gradient accumulator, modify that accumulator in-place
// else if there is no gradient for tensor, set the corresponding value
// else, just add/subtract/etc. the gradients
xuxzh1's avatar
init  
xuxzh1 committed
5029

xuxzh1's avatar
update  
xuxzh1 committed
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
// Adds `tensor` to the gradient stored at index isrc, or stores it as the
// gradient if none exists yet.
// - existing gradient: replaced by ggml_add_impl(existing, tensor); the add is
//   in-place when cgraph->grad_accs[isrc] is set
// - no gradient:       `tensor` itself becomes the gradient
// The gradient is then named "grad for <src name>" and the graph is expanded
// with it.
static void ggml_add_or_set(
        struct ggml_context * ctx,
        struct ggml_cgraph  * cgraph,
        size_t                isrc,
        struct ggml_tensor  * tensor) {
    struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
    GGML_ASSERT(src);

    struct ggml_tensor * prev = cgraph->grads[isrc];
    if (prev == NULL) {
        cgraph->grads[isrc] = tensor;
    } else {
        cgraph->grads[isrc] = ggml_add_impl(ctx, prev, tensor, /*inplace =*/ cgraph->grad_accs[isrc]);
    }

    ggml_format_name(cgraph->grads[isrc], "grad for %s", src->name);
    ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
}
xuxzh1's avatar
init  
xuxzh1 committed
5045

xuxzh1's avatar
update  
xuxzh1 committed
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
static void ggml_acc_or_set(
        struct ggml_context * ctx,
        struct ggml_cgraph  * cgraph,
        size_t                isrc,
        struct ggml_tensor  * tensor,
        const  size_t         nb1,
        const  size_t         nb2,
        const  size_t         nb3,
        const  size_t         offset) {
    struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
    GGML_ASSERT(src);
    if (cgraph->grads[isrc]) {
        cgraph->grads[isrc] = ggml_acc_impl(ctx, cgraph->grads[isrc], tensor, nb1, nb2, nb3, offset, cgraph->grad_accs[isrc]);
    } else {
        struct ggml_tensor * a_zero = ggml_scale(ctx, src, 0.0f); // FIXME this is going to produce NaN if a contains inf/NaN
        cgraph->grads[isrc] = ggml_acc_impl(ctx, a_zero, tensor, nb1, nb2, nb3, offset, false);
xuxzh1's avatar
init  
xuxzh1 committed
5062
    }
xuxzh1's avatar
update  
xuxzh1 committed
5063
5064
5065
    ggml_format_name(cgraph->grads[isrc], "grad for %s", cgraph->visited_hash_set.keys[isrc]->name);
    ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
}
xuxzh1's avatar
init  
xuxzh1 committed
5066

xuxzh1's avatar
update  
xuxzh1 committed
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
// Adds `tensor` to the gradient stored at index isrc via ggml_add1_impl, or
// creates the gradient if none exists yet.
// - existing gradient: replaced by ggml_add1_impl(existing, tensor); in-place
//   when cgraph->grad_accs[isrc] is set
// - no gradient:       set to ggml_repeat(ctx, tensor, src)
// The gradient is then named "grad for <src name>" and the graph is expanded
// with it.
static void ggml_add1_or_set(
        struct ggml_context * ctx,
        struct ggml_cgraph  * cgraph,
        size_t                isrc,
        struct ggml_tensor  * tensor) {
    struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
    GGML_ASSERT(src);
    if (cgraph->grads[isrc]) {
        cgraph->grads[isrc] = ggml_add1_impl(ctx, cgraph->grads[isrc], tensor, cgraph->grad_accs[isrc]);
    } else {
        cgraph->grads[isrc] = ggml_repeat(ctx, tensor, src);
    }
    ggml_format_name(cgraph->grads[isrc], "grad for %s", src->name);
    ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
}

xuxzh1's avatar
update  
xuxzh1 committed
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
// Subtracts `tensor` from the gradient stored at index isrc, or creates the
// gradient if none exists yet.
// - existing gradient: replaced by ggml_sub_impl(existing, tensor); in-place
//   when cgraph->grad_accs[isrc] is set
// - no gradient:       set to ggml_neg(ctx, tensor)
// The gradient is then named "grad for <src name>" and the graph is expanded
// with it.
static void ggml_sub_or_set(
        struct ggml_context * ctx,
        struct ggml_cgraph  * cgraph,
        size_t                isrc,
        struct ggml_tensor  * tensor) {
    struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
    GGML_ASSERT(src);

    struct ggml_tensor * prev = cgraph->grads[isrc];
    if (prev == NULL) {
        cgraph->grads[isrc] = ggml_neg(ctx, tensor);
    } else {
        cgraph->grads[isrc] = ggml_sub_impl(ctx, prev, tensor, cgraph->grad_accs[isrc]);
    }

    ggml_format_name(cgraph->grads[isrc], "grad for %s", src->name);
    ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
}
xuxzh1's avatar
init  
xuxzh1 committed
5098

xuxzh1's avatar
update  
xuxzh1 committed
5099
5100
5101
5102
static void ggml_compute_backward(
        struct ggml_context * ctx, struct ggml_cgraph * cgraph, int i, bool * grads_needed) {
    struct ggml_tensor * tensor = cgraph->nodes[i];
    struct ggml_tensor * grad   = ggml_graph_get_grad(cgraph, tensor);
xuxzh1's avatar
init  
xuxzh1 committed
5103

xuxzh1's avatar
update  
xuxzh1 committed
5104
5105
    if (!grad) {
        return;
xuxzh1's avatar
init  
xuxzh1 committed
5106
5107
    }

xuxzh1's avatar
update  
xuxzh1 committed
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
    struct ggml_tensor * src0 = tensor->src[0];
    struct ggml_tensor * src1 = tensor->src[1];
    struct ggml_tensor * src2 = tensor->src[2];
    struct ggml_hash_set * hash_set = &cgraph->visited_hash_set;
    const size_t isrc0 = src0 ? ggml_hash_find(hash_set, src0) : (size_t) -1;
    const size_t isrc1 = src1 ? ggml_hash_find(hash_set, src1) : (size_t) -1;
    const size_t isrc2 = src2 ? ggml_hash_find(hash_set, src2) : (size_t) -1;
    const bool src0_needs_grads = src0 && isrc0 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc0) && grads_needed[isrc0];
    const bool src1_needs_grads = src1 && isrc1 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc1) && grads_needed[isrc1];
    const bool src2_needs_grads = src2 && isrc2 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc2) && grads_needed[isrc2];
xuxzh1's avatar
init  
xuxzh1 committed
5118

xuxzh1's avatar
update  
xuxzh1 committed
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
    switch (tensor->op) {
        case GGML_OP_DUP: {
            if (src0_needs_grads) {
                ggml_add_or_set(ctx, cgraph, isrc0, grad);
            }
        } break;
        case GGML_OP_ADD: {
            if (src0_needs_grads) {
                ggml_add_or_set(ctx, cgraph, isrc0, grad);
            }
            if (src1_needs_grads) {
                struct ggml_tensor * tmp = grad;
                if (!ggml_are_same_shape(src0, src1)) {
                    tmp = ggml_repeat_back(ctx, tmp, src1);
                }
                ggml_add_or_set(ctx, cgraph, isrc1, tmp);
            }
        } break;
        case GGML_OP_ADD1: {
            if (src0_needs_grads) {
                ggml_add_or_set(ctx, cgraph, isrc0, grad);
            }
            if (src1_needs_grads) {
                ggml_add_or_set(ctx, cgraph, isrc1, ggml_mean(ctx, grad)); // TODO: should probably be sum instead of mean
            }
        } break;
        case GGML_OP_ACC: {
            if (src0_needs_grads) {
                ggml_add_or_set(ctx, cgraph, isrc0, grad);
            }
            if (src1_needs_grads) {
                const size_t nb1    = ((int32_t *) tensor->op_params)[0];
                const size_t nb2    = ((int32_t *) tensor->op_params)[1];
                const size_t nb3    = ((int32_t *) tensor->op_params)[2];
                const size_t offset = ((int32_t *) tensor->op_params)[3];

                struct ggml_tensor * tensor_grad_view = ggml_view_4d(ctx,
                    grad, src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
                    nb1, nb2, nb3, offset);

                ggml_add_or_set(ctx, cgraph, isrc1, ggml_reshape(ctx, ggml_cont(ctx, tensor_grad_view), src1));
            }
        } break;
        case GGML_OP_SUB: {
            if (src0_needs_grads) {
                ggml_add_or_set(ctx, cgraph, isrc0, grad);
            }
            if (src1_needs_grads) {
                ggml_sub_or_set(ctx, cgraph, isrc1, grad);
            }
        } break;
        case GGML_OP_MUL: {
            if (src0_needs_grads) {
                ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, src1, grad));
            }
            if (src1_needs_grads) {
                struct ggml_tensor * tmp = ggml_mul(ctx, src0, grad);
                if (!ggml_are_same_shape(src0, src1)) {
                    tmp = ggml_repeat_back(ctx, tmp, src1);
                }
                ggml_add_or_set(ctx, cgraph, isrc1, tmp);
            }
        } break;
        case GGML_OP_DIV: {
            if (src0_needs_grads) {
                ggml_add_or_set(ctx, cgraph, isrc0, ggml_div(ctx, grad, src1));
            }
            if (src1_needs_grads) {
                ggml_sub_or_set(ctx, cgraph, isrc1, ggml_mul(ctx, grad, ggml_div(ctx, tensor, src1)));
            }
        } break;
        case GGML_OP_SQR: {
            if (src0_needs_grads) {
                ggml_add_or_set(ctx, cgraph, isrc0, ggml_scale(ctx, ggml_mul(ctx, src0, grad), 2.0f));
            }
        } break;
        case GGML_OP_SQRT: {
            if (src0_needs_grads) {
                ggml_add_or_set(ctx, cgraph, isrc0, ggml_scale(ctx, ggml_div(ctx, grad, tensor), 0.5f));
            }
        } break;
        case GGML_OP_LOG: {
            if (src0_needs_grads) {
                ggml_add_or_set(ctx, cgraph, isrc0, ggml_div(ctx, grad, src0));
            }
        } break;
        case GGML_OP_SIN: {
            if (src0_needs_grads) {
                ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, grad, ggml_cos(ctx, src0)));
            }
        } break;
        case GGML_OP_COS: {
            if (src0_needs_grads) {
                ggml_sub_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, grad, ggml_sin(ctx, src0)));
            }
        } break;
        case GGML_OP_SUM: {
            if (src0_needs_grads) {
                ggml_add1_or_set(ctx, cgraph, isrc0, grad);
            }
        } break;
        case GGML_OP_SUM_ROWS: {
            if (src0_needs_grads) {
                ggml_add_or_set(ctx, cgraph, isrc0, ggml_repeat(ctx, grad, src0));
            }
        } break;
        case GGML_OP_MEAN: {
            if (src0_needs_grads) {
                ggml_add1_or_set(ctx, cgraph, isrc0, ggml_scale_impl(ctx, grad, 1.0f/src0->ne[0], false));
            }
        } break;
        case GGML_OP_REPEAT: {
            if (src0_needs_grads) {
                ggml_add_or_set(ctx, cgraph, isrc0, ggml_repeat_back(ctx, grad, src0));
            }
        } break;
        case GGML_OP_REPEAT_BACK: {
            if (src0_needs_grads) {
                ggml_add_or_set(ctx, cgraph, isrc0, ggml_repeat(ctx, grad, src0));
            }
        } break;
        case GGML_OP_RMS_NORM: {
            if (src0_needs_grads) {
                float eps;
                memcpy(&eps, tensor->op_params, sizeof(float));
                ggml_add_or_set(ctx, cgraph, isrc0, ggml_rms_norm_back(ctx, src0, grad, eps));
            }
        } break;
        case GGML_OP_MUL_MAT: {
            // https://cs231n.github.io/optimization-2/#staged
            // # forward pass
            // s0 = np.random.randn(5, 10)
            // s1 = np.random.randn(10, 3)
            // t = s0.dot(s1)

            // # now suppose we had the gradient on t from above in the circuit
            // dt = np.random.randn(*t.shape) # same shape as t
            // ds0 = dt.dot(s1.T) #.T gives the transpose of the matrix
            // ds1 = t.T.dot(dt)

            // tensor.shape [m,p,qq,rr]
            // src0.shape   [n,m,q1,r1]
            // src1.shape   [n,p,qq,rr]

            if (src0_needs_grads) {
                struct ggml_tensor * s1_tg =
                    ggml_out_prod(ctx, // [n,m,qq,rr]
                        src1,          // [n,p,qq,rr]
                        grad);         // [m,p,qq,rr]
                const int64_t qq = s1_tg->ne[2];
                const int64_t rr = s1_tg->ne[3];
                const int64_t q1 = src0->ne[2];
                const int64_t r1 = src0->ne[3];
                const bool ne2_broadcasted = qq > q1;
                const bool ne3_broadcasted = rr > r1;
                if (ne2_broadcasted || ne3_broadcasted) {
                    // sum broadcast repetitions of s1_tg into shape of src0
                    s1_tg = ggml_repeat_back(ctx, s1_tg, src0);
                }
                ggml_add_or_set(ctx, cgraph, isrc0, s1_tg /*= [n,m,q1,r1]*/);
            }
            if (src1_needs_grads) {
                ggml_add_or_set(ctx, cgraph, isrc1,
                        // ggml_mul_mat(ctx,                   // [n,p,qq,rr]
                        //     ggml_cont(ctx,                  // [m,n,q1,r1]
                        //         ggml_transpose(ctx, src0)), // [m,n,q1,r1]
                        //     grad),                          // [m,p,qq,rr]

                        // when src0 is bigger than tensor->grad (this is mostly the case in llama),
                        // avoid transpose of src0, rather transpose smaller tensor->grad
                        // and then use ggml_out_prod
                        ggml_out_prod(ctx,      // [n,p,qq,rr]
                            src0,               // [n,m,q1,r1]
                            ggml_transpose(ctx, // [p,m,qq,rr]
                                grad)));        // [m,p,qq,rr]
            }
        } break;
        case GGML_OP_SCALE: {
            if (src0_needs_grads) {
                float s;
                memcpy(&s, tensor->op_params, sizeof(float));
                ggml_add_or_set(ctx, cgraph, isrc0, ggml_scale_impl(ctx, grad, s, false));
            }
        } break;
        case GGML_OP_SET: {
            const size_t nb1    = ((const int32_t *) tensor->op_params)[0];
            const size_t nb2    = ((const int32_t *) tensor->op_params)[1];
            const size_t nb3    = ((const int32_t *) tensor->op_params)[2];
            const size_t offset = ((const int32_t *) tensor->op_params)[3];

            struct ggml_tensor * tensor_grad_view = NULL;

            if (src0_needs_grads || src1_needs_grads) {
                GGML_ASSERT(src0->type == tensor->type);
                GGML_ASSERT(!cgraph->grads[isrc0] ||                      cgraph->grads[isrc0]->type == grad->type);
                GGML_ASSERT(!cgraph->grads[isrc1] || !src1_needs_grads || cgraph->grads[isrc1]->type == grad->type);

                tensor_grad_view = ggml_view_4d(ctx,
                    grad, src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
                    nb1, nb2, nb3, offset);
            }

            if (src0_needs_grads) {
                struct ggml_tensor * tmp = ggml_neg(ctx, tensor_grad_view);
                ggml_add_or_set(ctx, cgraph, isrc0, ggml_acc_impl(ctx, grad, tmp, nb1, nb2, nb3, offset, false));
            }

            if (src1_needs_grads) {
                ggml_add_or_set(ctx, cgraph, isrc1, ggml_reshape(ctx, ggml_cont(ctx, tensor_grad_view), src1));
            }
        } break;
        case GGML_OP_CPY: {
            // cpy overwrites value of src1 by src0 and returns view(src1)
            // the overwriting is mathematically equivalent to:
            // tensor = src0 * 1 + src1 * 0
            if (src0_needs_grads) {
                // dsrc0 = dtensor * 1
                ggml_add_or_set(ctx, cgraph, isrc0, grad);
            }
            if (src1_needs_grads) {
                // dsrc1 = dtensor * 0 -> noop
            }
        } break;
        case GGML_OP_CONT: {
            // same as cpy
            if (src0_needs_grads) {
                GGML_ASSERT(!cgraph->grads[isrc0] || ggml_is_contiguous(cgraph->grads[isrc0]));
                GGML_ASSERT(ggml_is_contiguous(grad));
                ggml_add_or_set(ctx, cgraph, isrc0, grad);
            }
        } break;
        case GGML_OP_RESHAPE: {
            if (src0_needs_grads) {
                struct ggml_tensor * grad_cont = ggml_is_contiguous(grad) ? grad : ggml_cont(ctx, grad);
                ggml_add_or_set(ctx, cgraph, isrc0, ggml_reshape(ctx, grad_cont, src0));
            }
        } break;
        case GGML_OP_VIEW: {
            if (src0_needs_grads) {
                size_t offset;

                memcpy(&offset, tensor->op_params, sizeof(offset));

                size_t nb1 = tensor->nb[1];
                size_t nb2 = tensor->nb[2];
                size_t nb3 = tensor->nb[3];

                if (cgraph->grads[isrc0] && src0->type != cgraph->grads[isrc0]->type) {
                    // gradient is typically F32, but src0 could be other type
                    size_t ng = ggml_element_size(cgraph->grads[isrc0]);
                    size_t n0 = ggml_element_size(src0);
                    GGML_ASSERT(offset % n0 == 0);
                    GGML_ASSERT(nb1 % n0 == 0);
                    GGML_ASSERT(nb2 % n0 == 0);
                    GGML_ASSERT(nb3 % n0 == 0);
                    offset = (offset / n0) * ng;
                    nb1 = (nb1 / n0) * ng;
                    nb2 = (nb2 / n0) * ng;
                    nb3 = (nb3 / n0) * ng;
                }

                ggml_acc_or_set(ctx, cgraph, isrc0, grad, nb1, nb2, nb3, offset);
            }
        } break;
        case GGML_OP_PERMUTE: {
            if (src0_needs_grads) {
                const int32_t * axes = (const int32_t *) tensor->op_params;
                const int axis0 = axes[0] & 0x3;
                const int axis1 = axes[1] & 0x3;
                const int axis2 = axes[2] & 0x3;
                const int axis3 = axes[3] & 0x3;
                int axb[4] = {0,0,0,0}; // axes backward
                axb[axis0] = 0;
                axb[axis1] = 1;
                axb[axis2] = 2;
                axb[axis3] = 3;
                ggml_add_or_set(ctx, cgraph, isrc0, ggml_permute(ctx, grad, axb[0], axb[1], axb[2], axb[3]));
            }
        } break;
        case GGML_OP_TRANSPOSE: {
            if (src0_needs_grads) {
                ggml_add_or_set(ctx, cgraph, isrc0, ggml_transpose(ctx, grad));
            }
        } break;
        case GGML_OP_GET_ROWS: {
            if (src0_needs_grads) {
                ggml_add_or_set(ctx, cgraph, isrc0, ggml_get_rows_back(ctx, grad, src1, src0));
            }
            if (src1_needs_grads) {
                // noop
            }
        } break;
        case GGML_OP_DIAG_MASK_INF: {
            if (src0_needs_grads) {
                /* ggml_diag_mask_inf_impl() shouldn't be here */
                /* ref:  https://github.com/ggerganov/llama.cpp/pull/4203#discussion_r1412377992 */
                const int n_past = ((const int32_t *) tensor->op_params)[0];
                ggml_add_or_set(ctx, cgraph, isrc0, ggml_diag_mask_zero_impl(ctx, grad, n_past, false));
            }
        } break;
        case GGML_OP_DIAG_MASK_ZERO: {
            if (src0_needs_grads) {
                const int n_past = ((const int32_t *) tensor->op_params)[0];
                ggml_add_or_set(ctx, cgraph, isrc0, ggml_diag_mask_zero_impl(ctx, grad, n_past, false));
            }
        } break;
        case GGML_OP_SOFT_MAX: {
            if (src0_needs_grads) {
                ggml_add_or_set(ctx, cgraph, isrc0, ggml_soft_max_back(ctx, grad, tensor));
            }
            GGML_ASSERT((!src1 || !src1_needs_grads) && "backward pass for softmax mask not implemented");
        } break;
        case GGML_OP_ROPE: {
            if (src0_needs_grads) {
                //const int n_past = ((int32_t *) tensor->op_params)[0];
                const int n_dims     = ((const int32_t *) tensor->op_params)[1];
                const int mode       = ((const int32_t *) tensor->op_params)[2];
                //const int n_ctx      = ((int32_t *) tensor->op_params)[3];
                const int n_ctx_orig = ((const int32_t *) tensor->op_params)[4];
                float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;

                memcpy(&freq_base,   (const float *) tensor->op_params +  5, sizeof(float));
                memcpy(&freq_scale,  (const float *) tensor->op_params +  6, sizeof(float));
                memcpy(&ext_factor,  (const float *) tensor->op_params +  7, sizeof(float));
                memcpy(&attn_factor, (const float *) tensor->op_params +  8, sizeof(float));
                memcpy(&beta_fast,   (const float *) tensor->op_params +  9, sizeof(float));
                memcpy(&beta_slow,   (const float *) tensor->op_params + 10, sizeof(float));

                ggml_add_or_set(ctx, cgraph, isrc0,
                    ggml_rope_back(ctx, grad, src1, src2, n_dims, mode, n_ctx_orig, freq_base,
                        freq_scale, ext_factor, attn_factor, beta_fast, beta_slow));
            }
            GGML_ASSERT((!src2 || !src2_needs_grads) && "gradients for freq factors not implemented");
        } break;
        case GGML_OP_IM2COL: {
            if (src1_needs_grads) {
                const int32_t s0    = ggml_get_op_params_i32(tensor, 0);
                const int32_t s1    = ggml_get_op_params_i32(tensor, 1);
                const int32_t p0    = ggml_get_op_params_i32(tensor, 2);
                const int32_t p1    = ggml_get_op_params_i32(tensor, 3);
                const int32_t d0    = ggml_get_op_params_i32(tensor, 4);
                const int32_t d1    = ggml_get_op_params_i32(tensor, 5);
                const bool    is_2D = ggml_get_op_params_i32(tensor, 6) == 1;

                ggml_add_or_set(ctx, cgraph, isrc1, ggml_im2col_back(ctx, src0, grad, src1->ne, s0, s1, p0, p1, d0, d1, is_2D));
            }
        } break;
        case GGML_OP_POOL_2D: {
            if (src0_needs_grads) {
                const enum ggml_op_pool op = ggml_get_op_params_i32(tensor, 0);
                const      int32_t      k0 = ggml_get_op_params_i32(tensor, 1);
                const      int32_t      k1 = ggml_get_op_params_i32(tensor, 2);
                const      int32_t      s0 = ggml_get_op_params_i32(tensor, 3);
                const      int32_t      s1 = ggml_get_op_params_i32(tensor, 4);
                const      int32_t      p0 = ggml_get_op_params_i32(tensor, 5);
                const      int32_t      p1 = ggml_get_op_params_i32(tensor, 6);

                ggml_add_or_set(ctx, cgraph, isrc0, ggml_pool_2d_back(ctx, grad, src0, op, k0, k1, s0, s1, p0, p1));
            }
        } break;
        case GGML_OP_WIN_PART:
        case GGML_OP_WIN_UNPART:
        case GGML_OP_UNARY: {
            switch (ggml_get_unary_op(tensor)) {
                case GGML_UNARY_OP_ABS: {
                    if (src0_needs_grads) {
                        ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, ggml_sgn(ctx, src0), grad));
                    }
                } break;
                case GGML_UNARY_OP_SGN: {
                    // noop
                } break;
                case GGML_UNARY_OP_NEG: {
                    if (src0_needs_grads) {
                        ggml_sub_or_set(ctx, cgraph, isrc0, grad);
                    }
                } break;
                case GGML_UNARY_OP_STEP: {
                    // noop
                } break;
                case GGML_UNARY_OP_RELU: {
                    if (src0_needs_grads) {
                        ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, ggml_step(ctx, src0), grad));
                    }
                } break;
                case GGML_UNARY_OP_SILU: {
                    if (src0_needs_grads) {
                        ggml_add_or_set(ctx, cgraph, isrc0, ggml_silu_back(ctx, src0, grad));
                    }
                } break;
                case GGML_UNARY_OP_EXP: {
                    if (src0_needs_grads) {
                        ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, tensor, grad));
                    }
                } break;
                default: {
                    fprintf(stderr, "%s: unsupported unary op for backward pass: %s\n",
                        __func__, ggml_unary_op_name(ggml_get_unary_op(tensor)));
                    GGML_ABORT("fatal error");
                } //break;
            }
        } break;
        case GGML_OP_CROSS_ENTROPY_LOSS: {
            if (src0_needs_grads) {
                ggml_add_or_set(ctx, cgraph, isrc0, ggml_cross_entropy_loss_back(ctx, src0, src1, grad));
            }
            GGML_ASSERT(!src1_needs_grads && "backward pass for labels not implemented");
        } break;
        case GGML_OP_NONE: {
            // noop
        } break;
        case GGML_OP_COUNT:
        default: {
            fprintf(stderr, "%s: unsupported ggml op for backward pass: %s\n", __func__, ggml_op_name(tensor->op));
            GGML_ABORT("fatal error");
        } //break;
xuxzh1's avatar
init  
xuxzh1 committed
5535
5536
    }

xuxzh1's avatar
update  
xuxzh1 committed
5537
5538
5539
    GGML_ASSERT(!src0_needs_grads || ggml_are_same_shape(src0, cgraph->grads[isrc0]));
    GGML_ASSERT(!src1_needs_grads || ggml_are_same_shape(src1, cgraph->grads[isrc1]));
    GGML_ASSERT(!src2_needs_grads || ggml_are_same_shape(src2, cgraph->grads[isrc2]));
xuxzh1's avatar
init  
xuxzh1 committed
5540
5541
}

xuxzh1's avatar
update  
xuxzh1 committed
5542
5543
5544
5545
static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) {
    // check if already visited
    if (ggml_hash_insert(&cgraph->visited_hash_set, node) == GGML_HASHSET_ALREADY_EXISTS) {
        return;
xuxzh1's avatar
init  
xuxzh1 committed
5546
5547
    }

xuxzh1's avatar
update  
xuxzh1 committed
5548
5549
5550
5551
5552
5553
5554
    for (int i = 0; i < GGML_MAX_SRC; ++i) {
        const int k =
            (cgraph->order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? i :
            (cgraph->order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? (GGML_MAX_SRC-1-i) :
            /* unknown order, just fall back to using i*/ i;
        if (node->src[k]) {
            ggml_visit_parents(cgraph, node->src[k]);
xuxzh1's avatar
init  
xuxzh1 committed
5555
5556
5557
        }
    }

xuxzh1's avatar
update  
xuxzh1 committed
5558
5559
5560
    if (node->op == GGML_OP_NONE && !(node->flags & GGML_TENSOR_FLAG_PARAM)) {
        // reached a leaf node, not part of the gradient graph (e.g. a constant)
        GGML_ASSERT(cgraph->n_leafs < cgraph->size);
xuxzh1's avatar
init  
xuxzh1 committed
5561

xuxzh1's avatar
update  
xuxzh1 committed
5562
5563
        if (strlen(node->name) == 0) {
            ggml_format_name(node, "leaf_%d", cgraph->n_leafs);
xuxzh1's avatar
init  
xuxzh1 committed
5564
5565
        }

xuxzh1's avatar
update  
xuxzh1 committed
5566
5567
5568
5569
        cgraph->leafs[cgraph->n_leafs] = node;
        cgraph->n_leafs++;
    } else {
        GGML_ASSERT(cgraph->n_nodes < cgraph->size);
xuxzh1's avatar
init  
xuxzh1 committed
5570

xuxzh1's avatar
update  
xuxzh1 committed
5571
5572
5573
        if (strlen(node->name) == 0) {
            ggml_format_name(node, "node_%d", cgraph->n_nodes);
        }
xuxzh1's avatar
init  
xuxzh1 committed
5574

xuxzh1's avatar
update  
xuxzh1 committed
5575
5576
5577
        cgraph->nodes[cgraph->n_nodes] = node;
        cgraph->n_nodes++;
    }
xuxzh1's avatar
init  
xuxzh1 committed
5578
5579
}

xuxzh1's avatar
update  
xuxzh1 committed
5580
5581
5582
5583
5584
static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand) {
    if (!expand) {
        // TODO: this branch isn't accessible anymore, maybe move this to ggml_build_forward_expand
        ggml_graph_clear(cgraph);
    }
xuxzh1's avatar
init  
xuxzh1 committed
5585

xuxzh1's avatar
update  
xuxzh1 committed
5586
    const int n0 = cgraph->n_nodes;
xuxzh1's avatar
init  
xuxzh1 committed
5587

xuxzh1's avatar
update  
xuxzh1 committed
5588
    ggml_visit_parents(cgraph, tensor);
xuxzh1's avatar
init  
xuxzh1 committed
5589

xuxzh1's avatar
update  
xuxzh1 committed
5590
5591
    const int n_new = cgraph->n_nodes - n0;
    GGML_PRINT_DEBUG("%s: visited %d new nodes\n", __func__, n_new);
xuxzh1's avatar
init  
xuxzh1 committed
5592

xuxzh1's avatar
update  
xuxzh1 committed
5593
5594
5595
5596
5597
    if (n_new > 0) {
        // the last added node should always be starting point
        GGML_ASSERT(cgraph->nodes[cgraph->n_nodes - 1] == tensor);
    }
}
xuxzh1's avatar
init  
xuxzh1 committed
5598

xuxzh1's avatar
update  
xuxzh1 committed
5599
5600
5601
// Appends `tensor` and all tensors it depends on to `cgraph`, keeping the existing contents.
void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) {
    const bool expand = true; // do not clear the graph before adding
    ggml_build_forward_impl(cgraph, tensor, expand);
}
xuxzh1's avatar
init  
xuxzh1 committed
5602

xuxzh1's avatar
update  
xuxzh1 committed
5603
5604
5605
5606
5607
5608
5609
5610
void ggml_build_backward_expand(
        struct ggml_context * ctx_static,
        struct ggml_context * ctx_compute,
        struct ggml_cgraph  * cgraph,
        bool                  accumulate) {
    GGML_ASSERT(cgraph->n_nodes > 0);
    GGML_ASSERT(cgraph->grads);
    GGML_ASSERT(cgraph->grad_accs);
xuxzh1's avatar
init  
xuxzh1 committed
5611

xuxzh1's avatar
update  
xuxzh1 committed
5612
    const int n_nodes_f = cgraph->n_nodes;
xuxzh1's avatar
init  
xuxzh1 committed
5613

xuxzh1's avatar
update  
xuxzh1 committed
5614
5615
5616
    memset(cgraph->grads,     0, cgraph->visited_hash_set.size*sizeof(struct ggml_tensor *));
    memset(cgraph->grad_accs, 0, cgraph->visited_hash_set.size*sizeof(struct ggml_tensor *));
    bool * grads_needed = calloc(cgraph->visited_hash_set.size, sizeof(bool));
xuxzh1's avatar
init  
xuxzh1 committed
5617

xuxzh1's avatar
update  
xuxzh1 committed
5618
5619
5620
5621
5622
5623
5624
    {
        bool any_params = false;
        bool any_loss   = false;
        for (int i = 0; i < n_nodes_f; ++i) {
            struct ggml_tensor * node = cgraph->nodes[i];
            any_params = any_params || (node->flags & GGML_TENSOR_FLAG_PARAM);
            any_loss   = any_loss   || (node->flags & GGML_TENSOR_FLAG_LOSS);
xuxzh1's avatar
init  
xuxzh1 committed
5625
        }
xuxzh1's avatar
update  
xuxzh1 committed
5626
5627
        GGML_ASSERT(any_params && "no trainable parameters found, did you forget to call ggml_set_param?");
        GGML_ASSERT(any_loss && "no training loss found, did you forget to call ggml_set_loss?");
xuxzh1's avatar
init  
xuxzh1 committed
5628
5629
    }

xuxzh1's avatar
update  
xuxzh1 committed
5630
5631
    for (int i = 0; i < n_nodes_f; ++i) {
        struct ggml_tensor * node = cgraph->nodes[i];
xuxzh1's avatar
init  
xuxzh1 committed
5632

xuxzh1's avatar
update  
xuxzh1 committed
5633
5634
        if (node->type == GGML_TYPE_I32) {
            continue;
xuxzh1's avatar
init  
xuxzh1 committed
5635
5636
        }

xuxzh1's avatar
update  
xuxzh1 committed
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
        bool node_needs_grad = (node->flags & GGML_TENSOR_FLAG_PARAM) || (node->flags & GGML_TENSOR_FLAG_LOSS);
        bool ignore_src[GGML_MAX_SRC] = {false};
        switch (node->op) {
            // gradients in node->src[0] for one reason or another have no effect on output gradients
            case GGML_OP_IM2COL:      // only used for its shape
            case GGML_OP_IM2COL_BACK: // same as IM2COL
                ignore_src[0] = true;
                break;
            case GGML_OP_UNARY: {
                const enum ggml_unary_op uop = ggml_get_unary_op(node);
                // SGN and STEP unary ops are piecewise constant
                if (uop == GGML_UNARY_OP_SGN || uop == GGML_UNARY_OP_STEP) {
                    ignore_src[0] = true;
xuxzh1's avatar
init  
xuxzh1 committed
5650
                }
xuxzh1's avatar
update  
xuxzh1 committed
5651
            } break;
xuxzh1's avatar
init  
xuxzh1 committed
5652

xuxzh1's avatar
update  
xuxzh1 committed
5653
5654
5655
5656
5657
5658
5659
            // gradients in node->src[1] for one reason or another have no effect on output gradients
            case GGML_OP_CPY:           // gradients in CPY target are irrelevant
            case GGML_OP_GET_ROWS:      // row indices not differentiable
            case GGML_OP_GET_ROWS_BACK: // same as for GET_ROWS
            case GGML_OP_ROPE:          // positions not differentiable
                ignore_src[1] = true;
                break;
xuxzh1's avatar
init  
xuxzh1 committed
5660

xuxzh1's avatar
update  
xuxzh1 committed
5661
5662
5663
5664
5665
5666
            default:
                break;
        }
        for (int j = 0; j < GGML_MAX_SRC; ++j) {
            if (!node->src[j] || ignore_src[j] || !grads_needed[ggml_hash_find(&cgraph->visited_hash_set, node->src[j])]) {
                continue;
xuxzh1's avatar
init  
xuxzh1 committed
5667
            }
xuxzh1's avatar
update  
xuxzh1 committed
5668
5669
5670
5671
5672
5673
            GGML_ASSERT(node->src[j]->type == GGML_TYPE_F32 || node->src[j]->type == GGML_TYPE_F16);
            node_needs_grad = true;
            break;
        }
        if (!node_needs_grad) {
            continue;
xuxzh1's avatar
init  
xuxzh1 committed
5674
5675
        }

xuxzh1's avatar
update  
xuxzh1 committed
5676
5677
5678
        // inplace operations are currently not supported
        GGML_ASSERT(!node->view_src || node->op == GGML_OP_CPY || node->op == GGML_OP_VIEW ||
            node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE);
xuxzh1's avatar
init  
xuxzh1 committed
5679

xuxzh1's avatar
update  
xuxzh1 committed
5680
5681
5682
5683
5684
5685
5686
        const size_t igrad = ggml_hash_find(&cgraph->visited_hash_set, node);
        GGML_ASSERT(igrad != GGML_HASHSET_FULL);
        GGML_ASSERT(ggml_bitset_get(cgraph->visited_hash_set.used, igrad));
        if ((accumulate && (node->flags & GGML_TENSOR_FLAG_PARAM)) || (node->flags & GGML_TENSOR_FLAG_LOSS)) {
            cgraph->grad_accs[igrad] = ggml_dup_tensor(ctx_static, node);
            cgraph->grads[igrad]     = cgraph->grad_accs[igrad];
            ggml_format_name(cgraph->grad_accs[igrad], "grad acc for %s", node->name);
xuxzh1's avatar
init  
xuxzh1 committed
5687
        }
xuxzh1's avatar
update  
xuxzh1 committed
5688
        grads_needed[igrad] = true;
xuxzh1's avatar
init  
xuxzh1 committed
5689
5690
    }

xuxzh1's avatar
update  
xuxzh1 committed
5691
5692
5693
5694
5695
    for (int i = n_nodes_f - 1; i >= 0; --i) {
        // inplace operations to add gradients are not created by ggml_compute_backward except for gradient accumulation
        // use allocator to automatically make inplace operations
        ggml_compute_backward(ctx_compute, cgraph, i, grads_needed);
    }
xuxzh1's avatar
init  
xuxzh1 committed
5696

xuxzh1's avatar
update  
xuxzh1 committed
5697
    free(grads_needed);
xuxzh1's avatar
init  
xuxzh1 committed
5698
5699
}

xuxzh1's avatar
update  
xuxzh1 committed
5700
5701
5702
5703
5704
// Bump-pointer helper: rounds *p up with GGML_PAD(*p, align), returns that address and
// advances *p to `size` bytes past it.
static void * incr_ptr_aligned(void ** p, size_t size, size_t align) {
    void * ptr = *p;
    ptr = (void *) GGML_PAD((uintptr_t) ptr, align);
    *p = (void *) ((char *) ptr + size);
    return ptr;
}

xuxzh1's avatar
update  
xuxzh1 committed
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
static size_t ggml_graph_nbytes(size_t size, bool grads) {
    size_t hash_size = ggml_hash_size(size * 2);
    void * p = 0;
    incr_ptr_aligned(&p, sizeof(struct ggml_cgraph), 1);
    incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // nodes
    incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // leafs
    incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // hash keys
    if (grads) {
        incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // grads
        incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // grad_accs
xuxzh1's avatar
init  
xuxzh1 committed
5717
    }
xuxzh1's avatar
update  
xuxzh1 committed
5718
5719
5720
5721
    incr_ptr_aligned(&p, ggml_bitset_size(hash_size) * sizeof(ggml_bitset_t), sizeof(ggml_bitset_t));

    size_t nbytes = (size_t) p;
    return nbytes;
xuxzh1's avatar
init  
xuxzh1 committed
5722
5723
}

xuxzh1's avatar
update  
xuxzh1 committed
5724
5725
// Returns GGML_OBJECT_SIZE plus the graph size from ggml_graph_nbytes(size, grads),
// the latter padded to GGML_MEM_ALIGN.
size_t ggml_graph_overhead_custom(size_t size, bool grads) {
    return GGML_OBJECT_SIZE + GGML_PAD(ggml_graph_nbytes(size, grads), GGML_MEM_ALIGN);
}

xuxzh1's avatar
update  
xuxzh1 committed
5728
5729
// Overhead of a graph of GGML_DEFAULT_GRAPH_SIZE without gradient storage.
size_t ggml_graph_overhead(void) {
    return ggml_graph_overhead_custom(GGML_DEFAULT_GRAPH_SIZE, false);
}

xuxzh1's avatar
update  
xuxzh1 committed
5732
5733
5734
5735
struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads) {
    const size_t obj_size = ggml_graph_nbytes(size, grads);
    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_GRAPH, obj_size);
    struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
xuxzh1's avatar
init  
xuxzh1 committed
5736

xuxzh1's avatar
update  
xuxzh1 committed
5737
5738
    // the size of the hash table is doubled since it needs to hold both nodes and leafs
    size_t hash_size = ggml_hash_size(size * 2);
xuxzh1's avatar
init  
xuxzh1 committed
5739

xuxzh1's avatar
update  
xuxzh1 committed
5740
    void * p = cgraph + 1;
xuxzh1's avatar
init  
xuxzh1 committed
5741

xuxzh1's avatar
update  
xuxzh1 committed
5742
5743
5744
5745
5746
    struct ggml_tensor ** nodes_ptr     =         incr_ptr_aligned(&p, size      * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
    struct ggml_tensor ** leafs_ptr     =         incr_ptr_aligned(&p, size      * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
    struct ggml_tensor ** hash_keys_ptr =         incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
    struct ggml_tensor ** grads_ptr     = grads ? incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)) : NULL;
    struct ggml_tensor ** grad_accs_ptr = grads ? incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)) : NULL;
xuxzh1's avatar
init  
xuxzh1 committed
5747

xuxzh1's avatar
update  
xuxzh1 committed
5748
5749
5750
5751
    ggml_bitset_t * hash_used = incr_ptr_aligned(&p, ggml_bitset_size(hash_size) * sizeof(ggml_bitset_t), sizeof(ggml_bitset_t));

    // check that we allocated the correct amount of memory
    assert(obj_size == (size_t)((char *)p - (char *)cgraph));
xuxzh1's avatar
init  
xuxzh1 committed
5752

xuxzh1's avatar
update  
xuxzh1 committed
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
    *cgraph = (struct ggml_cgraph) {
        /*.size         =*/ size,
        /*.n_nodes      =*/ 0,
        /*.n_leafs      =*/ 0,
        /*.nodes        =*/ nodes_ptr,
        /*.grads        =*/ grads_ptr,
        /*.grad_accs    =*/ grad_accs_ptr,
        /*.leafs        =*/ leafs_ptr,
        /*.hash_table   =*/ { hash_size, hash_used, hash_keys_ptr },
        /*.order        =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
    };
xuxzh1's avatar
init  
xuxzh1 committed
5764

xuxzh1's avatar
update  
xuxzh1 committed
5765
5766
5767
5768
    ggml_hash_set_reset(&cgraph->visited_hash_set);
    if (grads) {
        memset(cgraph->grads,     0, hash_size*sizeof(struct ggml_tensor *));
        memset(cgraph->grad_accs, 0, hash_size*sizeof(struct ggml_tensor *));
xuxzh1's avatar
init  
xuxzh1 committed
5769
5770
    }

xuxzh1's avatar
update  
xuxzh1 committed
5771
5772
    return cgraph;
}
xuxzh1's avatar
init  
xuxzh1 committed
5773

xuxzh1's avatar
update  
xuxzh1 committed
5774
5775
5776
// Creates a graph of GGML_DEFAULT_GRAPH_SIZE in `ctx` without gradient storage.
struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
    const bool with_grads = false;
    return ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, with_grads);
}
xuxzh1's avatar
init  
xuxzh1 committed
5777

xuxzh1's avatar
update  
xuxzh1 committed
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
// Returns, by value, a graph that aliases nodes [i0, i1) of `cgraph0`.
//
// The view shares the node array of `cgraph0` (no copy) and inherits its evaluation order.
// It has no leafs, no gradients and an empty hash set, and its size is 0.
// The indices are not range-checked here.
struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) {
    struct ggml_cgraph cgraph = {
        /*.size             =*/ 0,
        /*.n_nodes          =*/ i1 - i0,
        /*.n_leafs          =*/ 0,
        /*.nodes            =*/ cgraph0->nodes + i0,
        /*.grads            =*/ NULL, // gradients would need visited_hash_set
        /*.grad_accs        =*/ NULL,
        /*.leafs            =*/ NULL,
        /*.visited_hash_set =*/ { 0, NULL, NULL },
        /*.order            =*/ cgraph0->order,
    };

    return cgraph;
}
xuxzh1's avatar
init  
xuxzh1 committed
5793

xuxzh1's avatar
update  
xuxzh1 committed
5794
5795
5796
5797
void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
    GGML_ASSERT(dst->size >= src->n_leafs);
    GGML_ASSERT(dst->size >= src->n_nodes);
    GGML_ASSERT(dst->visited_hash_set.size >= src->visited_hash_set.size);
xuxzh1's avatar
init  
xuxzh1 committed
5798

xuxzh1's avatar
update  
xuxzh1 committed
5799
5800
5801
    dst->n_leafs = src->n_leafs;
    dst->n_nodes = src->n_nodes;
    dst->order   = src->order;
xuxzh1's avatar
init  
xuxzh1 committed
5802

xuxzh1's avatar
update  
xuxzh1 committed
5803
5804
5805
    for (int i = 0; i < src->n_leafs; ++i) {
        dst->leafs[i] = src->leafs[i];
    }
xuxzh1's avatar
init  
xuxzh1 committed
5806

xuxzh1's avatar
update  
xuxzh1 committed
5807
5808
5809
    for (int i = 0; i < src->n_nodes; ++i) {
        dst->nodes[i] = src->nodes[i];
    }
xuxzh1's avatar
init  
xuxzh1 committed
5810

xuxzh1's avatar
update  
xuxzh1 committed
5811
5812
5813
5814
    for (size_t i = 0; i < src->visited_hash_set.size; ++i) {
        // copy all hashset keys (tensors) that are in use
        if (ggml_bitset_get(src->visited_hash_set.used, i)) {
            ggml_hash_insert(&dst->visited_hash_set, src->visited_hash_set.keys[i]);
xuxzh1's avatar
init  
xuxzh1 committed
5815
        }
xuxzh1's avatar
update  
xuxzh1 committed
5816
    }
xuxzh1's avatar
init  
xuxzh1 committed
5817

xuxzh1's avatar
update  
xuxzh1 committed
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
    if (dst->grads) {
        memset(dst->grads,     0, dst->visited_hash_set.size*sizeof(struct ggml_tensor *));
        memset(dst->grad_accs, 0, dst->visited_hash_set.size*sizeof(struct ggml_tensor *));
    }
    if (src->grads) {
        GGML_ASSERT(dst->grads     != NULL);
        GGML_ASSERT(dst->grad_accs != NULL);
        for (int i = 0; i < src->n_nodes; ++i) {
            const size_t igrad_src = ggml_hash_find(&src->visited_hash_set, src->nodes[i]);
            const size_t igrad_dst = ggml_hash_find(&dst->visited_hash_set, dst->nodes[i]);
xuxzh1's avatar
init  
xuxzh1 committed
5828

xuxzh1's avatar
update  
xuxzh1 committed
5829
5830
5831
5832
            GGML_ASSERT(igrad_src != GGML_HASHSET_FULL);
            GGML_ASSERT(ggml_bitset_get(src->visited_hash_set.used, igrad_src));
            GGML_ASSERT(igrad_dst != GGML_HASHSET_FULL);
            GGML_ASSERT(ggml_bitset_get(dst->visited_hash_set.used, igrad_dst));
xuxzh1's avatar
init  
xuxzh1 committed
5833

xuxzh1's avatar
update  
xuxzh1 committed
5834
5835
            dst->grads[igrad_dst]     = src->grads[igrad_src];
            dst->grad_accs[igrad_dst] = src->grad_accs[igrad_src];
xuxzh1's avatar
init  
xuxzh1 committed
5836
5837
5838
5839
        }
    }
}

xuxzh1's avatar
update  
xuxzh1 committed
5840
5841
5842
5843
5844
// Allocates a new graph in `ctx` with the same capacity as `cgraph` (and gradient storage
// if `cgraph` has it), then copies `cgraph` into it with ggml_graph_cpy.
struct ggml_cgraph * ggml_graph_dup(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
    const bool with_grads = cgraph->grads != NULL;

    struct ggml_cgraph * copy = ggml_new_graph_custom(ctx, cgraph->size, with_grads);
    ggml_graph_cpy(cgraph, copy);

    return copy;
}
xuxzh1's avatar
init  
xuxzh1 committed
5845

xuxzh1's avatar
update  
xuxzh1 committed
5846
5847
5848
// Fills the data of `tensor` with zero bytes and returns `tensor`.
//
// Empty tensors are returned untouched. If the tensor has a backend buffer the memset goes
// through ggml_backend_tensor_memset, otherwise tensor->data is written directly and must be
// non-NULL (asserted).
struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
    if (ggml_is_empty(tensor)) {
        return tensor;
    }
    if (tensor->buffer) {
        ggml_backend_tensor_memset(tensor, 0, 0, ggml_nbytes(tensor));
    } else {
        GGML_ASSERT(tensor->data);
        memset(tensor->data, 0, ggml_nbytes(tensor));
    }
    return tensor;
}
xuxzh1's avatar
init  
xuxzh1 committed
5858

xuxzh1's avatar
update  
xuxzh1 committed
5859
5860
void ggml_graph_reset(struct ggml_cgraph * cgraph) {
    GGML_ASSERT(cgraph->grads != NULL);
xuxzh1's avatar
init  
xuxzh1 committed
5861

xuxzh1's avatar
update  
xuxzh1 committed
5862
5863
5864
    for (int i = 0; i < cgraph->n_nodes; i++) {
        struct ggml_tensor * node     = cgraph->nodes[i];
        struct ggml_tensor * grad_acc = ggml_graph_get_grad_acc(cgraph, node);
xuxzh1's avatar
init  
xuxzh1 committed
5865

xuxzh1's avatar
update  
xuxzh1 committed
5866
5867
5868
5869
5870
        if (node->op == GGML_OP_OPT_STEP_ADAMW) {
            // clear momenta
            ggml_set_zero(node->src[2]);
            ggml_set_zero(node->src[3]);
        }
xuxzh1's avatar
init  
xuxzh1 committed
5871

xuxzh1's avatar
update  
xuxzh1 committed
5872
5873
5874
5875
5876
        // initial gradients of loss should be 1, 0 otherwise
        if (grad_acc) {
            if (node->flags & GGML_TENSOR_FLAG_LOSS) {
                GGML_ASSERT(grad_acc->type == GGML_TYPE_F32);
                GGML_ASSERT(ggml_is_scalar(grad_acc));
xuxzh1's avatar
init  
xuxzh1 committed
5877

xuxzh1's avatar
update  
xuxzh1 committed
5878
5879
5880
                const float onef = 1.0f;
                if (grad_acc->buffer) {
                    ggml_backend_tensor_set(grad_acc, &onef, 0, sizeof(float));
xuxzh1's avatar
init  
xuxzh1 committed
5881
                } else {
xuxzh1's avatar
update  
xuxzh1 committed
5882
5883
                    GGML_ASSERT(grad_acc->data);
                    *((float *) grad_acc->data) = onef;
xuxzh1's avatar
init  
xuxzh1 committed
5884
                }
xuxzh1's avatar
update  
xuxzh1 committed
5885
5886
            } else {
                ggml_set_zero(grad_acc);
xuxzh1's avatar
init  
xuxzh1 committed
5887
5888
5889
            }
        }
    }
xuxzh1's avatar
update  
xuxzh1 committed
5890
}
xuxzh1's avatar
init  
xuxzh1 committed
5891

xuxzh1's avatar
update  
xuxzh1 committed
5892
5893
5894
5895
5896
// Resets the visited hash set and sets the node and leaf counts back to zero.
void ggml_graph_clear(struct ggml_cgraph * cgraph) {
    ggml_hash_set_reset(&cgraph->visited_hash_set);

    cgraph->n_nodes = 0;
    cgraph->n_leafs = 0;
}
xuxzh1's avatar
init  
xuxzh1 committed
5897

xuxzh1's avatar
update  
xuxzh1 committed
5898
5899
// Returns the `size` field of the graph. ggml_graph_add_node() asserts that
// n_nodes stays below this value.
int ggml_graph_size(struct ggml_cgraph * cgraph) {
    return cgraph->size;
}

xuxzh1's avatar
update  
xuxzh1 committed
5902
5903
5904
5905
// Returns the i-th node of the graph.
//
// A negative index counts from the end: -1 is the last node, -n_nodes the first.
// Aborts if the index is out of range.
struct ggml_tensor * ggml_graph_node(struct ggml_cgraph * cgraph, int i) {
    if (i < 0) {
        GGML_ASSERT(cgraph->n_nodes + i >= 0);
        return cgraph->nodes[cgraph->n_nodes + i];
    }

    GGML_ASSERT(i < cgraph->n_nodes);
    return cgraph->nodes[i];
}

// Returns the graph's node array (the pointer stored in the graph, not a copy).
struct ggml_tensor ** ggml_graph_nodes(struct ggml_cgraph * cgraph) {
    struct ggml_tensor ** node_array = cgraph->nodes;
    return node_array;
}
xuxzh1's avatar
init  
xuxzh1 committed
5915

xuxzh1's avatar
update  
xuxzh1 committed
5916
5917
5918
// Returns the number of nodes currently stored in the graph.
int ggml_graph_n_nodes(struct ggml_cgraph * cgraph) {
    const int node_count = cgraph->n_nodes;
    return node_count;
}
xuxzh1's avatar
init  
xuxzh1 committed
5919

xuxzh1's avatar
update  
xuxzh1 committed
5920
5921
5922
5923
5924
// Appends `tensor` to the end of the graph's node array.
// Aborts if the node array is already full (n_nodes has reached size).
void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) {
    GGML_ASSERT(cgraph->size > cgraph->n_nodes);

    const int slot = cgraph->n_nodes;

    cgraph->nodes[slot] = tensor;
    cgraph->n_nodes     = slot + 1;
}
xuxzh1's avatar
init  
xuxzh1 committed
5925

xuxzh1's avatar
update  
xuxzh1 committed
5926
5927
5928
struct ggml_tensor * ggml_graph_get_tensor(const struct ggml_cgraph * cgraph, const char * name) {
    for (int i = 0; i < cgraph->n_leafs; i++) {
        struct ggml_tensor * leaf = cgraph->leafs[i];
xuxzh1's avatar
init  
xuxzh1 committed
5929

xuxzh1's avatar
update  
xuxzh1 committed
5930
5931
        if (strcmp(leaf->name, name) == 0) {
            return leaf;
xuxzh1's avatar
init  
xuxzh1 committed
5932
5933
5934
        }
    }

xuxzh1's avatar
update  
xuxzh1 committed
5935
5936
    for (int i = 0; i < cgraph->n_nodes; i++) {
        struct ggml_tensor * node = cgraph->nodes[i];
xuxzh1's avatar
init  
xuxzh1 committed
5937

xuxzh1's avatar
update  
xuxzh1 committed
5938
5939
5940
5941
        if (strcmp(node->name, name) == 0) {
            return node;
        }
    }
xuxzh1's avatar
init  
xuxzh1 committed
5942

xuxzh1's avatar
update  
xuxzh1 committed
5943
5944
    return NULL;
}
xuxzh1's avatar
init  
xuxzh1 committed
5945

xuxzh1's avatar
update  
xuxzh1 committed
5946
5947
5948
5949
// Returns the gradient tensor associated with `node` in this graph, or NULL.
//
// NULL is returned when the graph has no gradient array at all (cgraph->grads == NULL,
// e.g. a graph for which ggml_graph_reset() would assert), when `node` is not present
// in the graph's visited hash set, or when no gradient is stored for it.
struct ggml_tensor * ggml_graph_get_grad(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
    if (cgraph->grads == NULL) {
        // graph without gradients: nothing to index into
        return NULL;
    }

    const size_t igrad = ggml_hash_find(&cgraph->visited_hash_set, node);
    return igrad != GGML_HASHSET_FULL && ggml_bitset_get(cgraph->visited_hash_set.used, igrad) ? cgraph->grads[igrad] : NULL;
}
xuxzh1's avatar
init  
xuxzh1 committed
5950

xuxzh1's avatar
update  
xuxzh1 committed
5951
5952
5953
5954
// Returns the gradient accumulator tensor associated with `node` in this graph, or NULL.
//
// NULL is returned when the graph has no accumulator array (cgraph->grad_accs == NULL),
// when `node` is not present in the graph's visited hash set, or when no accumulator is
// stored for it.
struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
    if (cgraph->grad_accs == NULL) {
        // graph without gradient accumulators: nothing to index into
        return NULL;
    }

    const size_t igrad = ggml_hash_find(&cgraph->visited_hash_set, node);
    return igrad != GGML_HASHSET_FULL && ggml_bitset_get(cgraph->visited_hash_set.used, igrad) ? cgraph->grad_accs[igrad] : NULL;
}
xuxzh1's avatar
init  
xuxzh1 committed
5955

xuxzh1's avatar
update  
xuxzh1 committed
5956
5957
void ggml_graph_print(const struct ggml_cgraph * cgraph) {
    GGML_LOG_INFO("=== GRAPH ===\n");
xuxzh1's avatar
init  
xuxzh1 committed
5958

xuxzh1's avatar
update  
xuxzh1 committed
5959
5960
5961
    GGML_LOG_INFO("n_nodes = %d\n", cgraph->n_nodes);
    for (int i = 0; i < cgraph->n_nodes; i++) {
        struct ggml_tensor * node = cgraph->nodes[i];
xuxzh1's avatar
init  
xuxzh1 committed
5962

xuxzh1's avatar
update  
xuxzh1 committed
5963
5964
5965
5966
5967
        GGML_LOG_INFO(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s\n",
                i,
                node->ne[0], node->ne[1], node->ne[2],
                ggml_op_name(node->op), (node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" :
                      ggml_graph_get_grad(cgraph, node) ? "g" : " ");
xuxzh1's avatar
init  
xuxzh1 committed
5968
5969
    }

xuxzh1's avatar
update  
xuxzh1 committed
5970
5971
5972
    GGML_LOG_INFO("n_leafs = %d\n", cgraph->n_leafs);
    for (int i = 0; i < cgraph->n_leafs; i++) {
        struct ggml_tensor * node = cgraph->leafs[i];
xuxzh1's avatar
init  
xuxzh1 committed
5973

xuxzh1's avatar
update  
xuxzh1 committed
5974
5975
5976
5977
5978
        GGML_LOG_INFO(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s %16s\n",
                i,
                node->ne[0], node->ne[1],
                ggml_op_name(node->op),
                ggml_get_name(node));
xuxzh1's avatar
init  
xuxzh1 committed
5979
5980
    }

xuxzh1's avatar
update  
xuxzh1 committed
5981
5982
5983
5984
5985
5986
5987
    GGML_LOG_INFO("========================================\n");
}

// check if node is part of the graph
//
// Linear scan over the graph's nodes (leafs are not searched), comparing pointers.
// A NULL graph is treated as containing every node.
static bool ggml_graph_find(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
    if (cgraph == NULL) {
        return true;
    }

    for (int i = 0; i < cgraph->n_nodes; i++) {
        if (cgraph->nodes[i] == node) {
            return true;
        }
    }

    return false;
}
xuxzh1's avatar
init  
xuxzh1 committed
5998

xuxzh1's avatar
update  
xuxzh1 committed
5999
6000
6001
6002
// Finds the graph node whose gradient tensor is `node`.
//
// Returns the first node in the graph for which ggml_graph_get_grad() yields `node`,
// or NULL if `node` is not the gradient of any node in the graph.
static struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
    for (int i = 0; i < cgraph->n_nodes; i++) {
        struct ggml_tensor * parent = cgraph->nodes[i];
        struct ggml_tensor * grad = ggml_graph_get_grad(cgraph, parent);

        if (grad == node) {
            return parent;
        }
    }

    return NULL;
}

// Writes one DOT edge from `parent` to `node`.
//
// If an endpoint is the gradient of some graph node, the edge is attached to that
// node's "g" port instead of the endpoint's own "x" port. Edges that end in a
// gradient are drawn dashed with an empty arrowhead, all others solid with "vee".
static void ggml_graph_dump_dot_node_edge(FILE * fp, const struct ggml_cgraph * gb, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label)  {
    struct ggml_tensor * node_owner   = ggml_graph_get_parent(gb, node);
    struct ggml_tensor * parent_owner = ggml_graph_get_parent(gb, parent);

    const bool head_is_grad = node_owner   != NULL;
    const bool tail_is_grad = parent_owner != NULL;

    void * tail_id = tail_is_grad ? (void *) parent_owner : (void *) parent;
    void * head_id = head_is_grad ? (void *) node_owner   : (void *) node;

    const char * tail_port = tail_is_grad ? "g" : "x";
    const char * head_port = head_is_grad ? "g" : "x";
    const char * arrowhead = head_is_grad ? "empty"  : "vee";
    const char * linestyle = head_is_grad ? "dashed" : "solid";

    fprintf(fp, "  \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"%s\"; ]\n",
            tail_id, tail_port,
            head_id, head_port,
            arrowhead, linestyle,
            label);
}
xuxzh1's avatar
init  
xuxzh1 committed
6024

xuxzh1's avatar
update  
xuxzh1 committed
6025
6026
6027
6028
6029
6030
// Writes one DOT edge from `parent` to `node`, both attached at their "x" port.
static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label)  {
    void * tail_id = (void *) parent;
    void * head_id = (void *) node;

    fprintf(fp, "  \"%p\":%s -> \"%p\":%s [ label = \"%s\"; ]\n", tail_id, "x", head_id, "x", label);
}
xuxzh1's avatar
init  
xuxzh1 committed
6031

xuxzh1's avatar
update  
xuxzh1 committed
6032
6033
void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {
    char color[16];
xuxzh1's avatar
init  
xuxzh1 committed
6034

xuxzh1's avatar
update  
xuxzh1 committed
6035
6036
    FILE * fp = ggml_fopen(filename, "w");
    GGML_ASSERT(fp);
xuxzh1's avatar
init  
xuxzh1 committed
6037

xuxzh1's avatar
update  
xuxzh1 committed
6038
6039
6040
    fprintf(fp, "digraph G {\n");
    fprintf(fp, "  newrank = true;\n");
    fprintf(fp, "  rankdir = TB;\n");
xuxzh1's avatar
init  
xuxzh1 committed
6041

xuxzh1's avatar
update  
xuxzh1 committed
6042
6043
6044
    for (int i = 0; i < gb->n_nodes; i++) {
        struct ggml_tensor * node = gb->nodes[i];
        struct ggml_tensor * grad = ggml_graph_get_grad(gb, node);
xuxzh1's avatar
init  
xuxzh1 committed
6045

xuxzh1's avatar
update  
xuxzh1 committed
6046
6047
        if (ggml_graph_get_parent(gb, node) != NULL) {
            continue;
xuxzh1's avatar
init  
xuxzh1 committed
6048
6049
        }

xuxzh1's avatar
update  
xuxzh1 committed
6050
6051
6052
6053
6054
        if (node->flags & GGML_TENSOR_FLAG_PARAM) {
            snprintf(color, sizeof(color), "yellow");
        } else if (grad) {
            if (ggml_graph_find(gf, node)) {
                snprintf(color, sizeof(color), "green");
xuxzh1's avatar
init  
xuxzh1 committed
6055
            } else {
xuxzh1's avatar
update  
xuxzh1 committed
6056
                snprintf(color, sizeof(color), "lightblue");
xuxzh1's avatar
init  
xuxzh1 committed
6057
            }
xuxzh1's avatar
update  
xuxzh1 committed
6058
6059
        } else {
            snprintf(color, sizeof(color), "white");
xuxzh1's avatar
init  
xuxzh1 committed
6060
6061
        }

xuxzh1's avatar
update  
xuxzh1 committed
6062
6063
6064
6065
        fprintf(fp, "  \"%p\" [ "
                    "style = filled; fillcolor = %s; shape = record; "
                    "label=\"",
                (void *) node, color);
xuxzh1's avatar
init  
xuxzh1 committed
6066

xuxzh1's avatar
update  
xuxzh1 committed
6067
6068
6069
6070
        if (strlen(node->name) > 0) {
            fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
        } else {
            fprintf(fp, "(%s)|", ggml_type_name(node->type));
xuxzh1's avatar
init  
xuxzh1 committed
6071
6072
        }

xuxzh1's avatar
update  
xuxzh1 committed
6073
6074
6075
6076
        if (ggml_is_matrix(node)) {
            fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], ggml_op_symbol(node->op));
        } else {
            fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], ggml_op_symbol(node->op));
xuxzh1's avatar
init  
xuxzh1 committed
6077
6078
        }

xuxzh1's avatar
update  
xuxzh1 committed
6079
6080
6081
6082
        if (grad) {
            fprintf(fp, " | <g>%s\"; ]\n", ggml_op_symbol(grad->op));
        } else {
            fprintf(fp, "\"; ]\n");
xuxzh1's avatar
init  
xuxzh1 committed
6083
6084
6085
        }
    }

xuxzh1's avatar
update  
xuxzh1 committed
6086
6087
    for (int i = 0; i < gb->n_leafs; i++) {
        struct ggml_tensor * node = gb->leafs[i];
xuxzh1's avatar
init  
xuxzh1 committed
6088

xuxzh1's avatar
update  
xuxzh1 committed
6089
        snprintf(color, sizeof(color), "pink");
xuxzh1's avatar
init  
xuxzh1 committed
6090

xuxzh1's avatar
update  
xuxzh1 committed
6091
6092
6093
6094
        fprintf(fp, "  \"%p\" [ "
                    "style = filled; fillcolor = %s; shape = record; "
                    "label=\"<x>",
                (void *) node, color);
xuxzh1's avatar
init  
xuxzh1 committed
6095

xuxzh1's avatar
update  
xuxzh1 committed
6096
6097
6098
6099
6100
        if (strlen(node->name) > 0) {
            fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
        } else {
            fprintf(fp, "(%s)|", ggml_type_name(node->type));
        }
xuxzh1's avatar
init  
xuxzh1 committed
6101

xuxzh1's avatar
update  
xuxzh1 committed
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
        fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
        if (ggml_nelements(node) < 5 && node->data != NULL) {
            fprintf(fp, " | (");
            for (int j = 0; j < ggml_nelements(node); j++) {
                // FIXME: use ggml-backend to obtain the tensor data
                //if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
                //    fprintf(fp, "%d", ggml_get_i32_1d(node, j));
                //}
                //else if (node->type == GGML_TYPE_F32 ||
                //         node->type == GGML_TYPE_F16 ||
                //         node->type == GGML_TYPE_BF16) {
                //    fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, j));
                //}
                //else
                {
                    fprintf(fp, "#");
                }
                if (j < ggml_nelements(node) - 1) {
                    fprintf(fp, ", ");
                }
            }
            fprintf(fp, ")");
        }
        fprintf(fp, "\"; ]\n");
xuxzh1's avatar
init  
xuxzh1 committed
6126
6127
    }

xuxzh1's avatar
update  
xuxzh1 committed
6128
6129
    for (int i = 0; i < gb->n_nodes; i++) {
        struct ggml_tensor * node = gb->nodes[i];
xuxzh1's avatar
init  
xuxzh1 committed
6130

xuxzh1's avatar
update  
xuxzh1 committed
6131
6132
6133
6134
6135
6136
6137
6138
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            if (node->src[j]) {
                char label[16];
                snprintf(label, sizeof(label), "src %d", j);
                ggml_graph_dump_dot_node_edge(fp, gb, node, node->src[j], label);
            }
        }
    }
xuxzh1's avatar
init  
xuxzh1 committed
6139

xuxzh1's avatar
update  
xuxzh1 committed
6140
6141
    for (int i = 0; i < gb->n_leafs; i++) {
        struct ggml_tensor * node = gb->leafs[i];
xuxzh1's avatar
init  
xuxzh1 committed
6142

xuxzh1's avatar
update  
xuxzh1 committed
6143
6144
6145
6146
6147
6148
6149
        for (int j = 0; j < GGML_MAX_SRC; j++) {
            if (node->src[j]) {
                char label[16];
                snprintf(label, sizeof(label), "src %d", j);
                ggml_graph_dump_dot_leaf_edge(fp, node, node->src[j], label);
            }
        }
xuxzh1's avatar
init  
xuxzh1 committed
6150
6151
    }

xuxzh1's avatar
update  
xuxzh1 committed
6152
    fprintf(fp, "}\n");
xuxzh1's avatar
init  
xuxzh1 committed
6153

xuxzh1's avatar
update  
xuxzh1 committed
6154
    fclose(fp);
xuxzh1's avatar
init  
xuxzh1 committed
6155

xuxzh1's avatar
update  
xuxzh1 committed
6156
    GGML_LOG_INFO("%s: dot -Tpng %s -o %s.png && open %s.png\n", __func__, filename, filename, filename);
xuxzh1's avatar
init  
xuxzh1 committed
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
}

////////////////////////////////////////////////////////////////////////////////

// Sets the GGML_TENSOR_FLAG_INPUT bit in the tensor's flags; other flags are kept.
void ggml_set_input(struct ggml_tensor * tensor) {
    tensor->flags = tensor->flags | GGML_TENSOR_FLAG_INPUT;
}

// Sets the GGML_TENSOR_FLAG_OUTPUT bit in the tensor's flags; other flags are kept.
void ggml_set_output(struct ggml_tensor * tensor) {
    tensor->flags = tensor->flags | GGML_TENSOR_FLAG_OUTPUT;
}

xuxzh1's avatar
update  
xuxzh1 committed
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
// Sets the GGML_TENSOR_FLAG_PARAM bit in the tensor's flags.
// `ctx` is accepted for API compatibility only and is not used.
void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor) {
    GGML_UNUSED(ctx); // TODO: remove this parameter

    tensor->flags = tensor->flags | GGML_TENSOR_FLAG_PARAM;
}

// Sets the GGML_TENSOR_FLAG_LOSS bit in the tensor's flags.
// The tensor must be a scalar of type F32, otherwise this aborts.
void ggml_set_loss(struct ggml_tensor * tensor) {
    GGML_ASSERT(ggml_is_scalar(tensor));
    GGML_ASSERT(tensor->type == GGML_TYPE_F32);

    tensor->flags = tensor->flags | GGML_TENSOR_FLAG_LOSS;
}

xuxzh1's avatar
init  
xuxzh1 committed
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
////////////////////////////////////////////////////////////////////////////////

// Runs the init routine required by the given quantization type, inside the
// ggml critical section. Types other than the IQ1/IQ2/IQ3 ones listed below
// need no initialization and are ignored.
void ggml_quantize_init(enum ggml_type type) {
    ggml_critical_section_start();

    if (type == GGML_TYPE_IQ3_XXS) {
        iq3xs_init_impl(256);
    } else if (type == GGML_TYPE_IQ3_S) {
        iq3xs_init_impl(512);
    } else if (type == GGML_TYPE_IQ2_XXS ||
               type == GGML_TYPE_IQ2_XS  ||
               type == GGML_TYPE_IQ2_S   ||
               type == GGML_TYPE_IQ1_S   ||
               type == GGML_TYPE_IQ1_M) {
        iq2xs_init_impl(type);
    }
    // any other type: nothing to do

    ggml_critical_section_end();
}

void ggml_quantize_free(void) {
    ggml_critical_section_start();

    iq2xs_free_impl(GGML_TYPE_IQ2_XXS);
    iq2xs_free_impl(GGML_TYPE_IQ2_XS);
    iq2xs_free_impl(GGML_TYPE_IQ1_S);
    iq3xs_free_impl(256);

    ggml_critical_section_end();
}

// Returns true for the quantization types that ggml_quantize_chunk() refuses to
// run without an importance matrix: IQ2_XXS, IQ2_XS and IQ1_S.
bool ggml_quantize_requires_imatrix(enum ggml_type type) {
    switch (type) {
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ1_S:
            return true;
        default:
            // GGML_TYPE_IQ1_M is currently not part of this list
            return false;
    }
}

size_t ggml_quantize_chunk(
        enum ggml_type   type,
           const float * src,
                  void * dst,
               int64_t   start,
               int64_t   nrows,
               int64_t   n_per_row,
           const float * imatrix) {
    const int64_t n = (int64_t) nrows * n_per_row;

    if (ggml_quantize_requires_imatrix(type)) {
        GGML_ASSERT(imatrix != NULL);
    }

    GGML_ASSERT(start % type_traits[type].blck_size == 0);
    GGML_ASSERT(start % n_per_row == 0);

    ggml_quantize_init(type); // this is noop if already initialized

    const size_t start_row = start / n_per_row;
    const size_t row_size  = ggml_row_size(type, n_per_row);

    size_t result = 0;

    switch (type) {
        case GGML_TYPE_Q4_0:    result = quantize_q4_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_Q4_1:    result = quantize_q4_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_Q5_0:    result = quantize_q5_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_Q5_1:    result = quantize_q5_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_Q8_0:    result = quantize_q8_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_Q2_K:    result = quantize_q2_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_Q3_K:    result = quantize_q3_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_Q4_K:    result = quantize_q4_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_Q5_K:    result = quantize_q5_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_Q6_K:    result = quantize_q6_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
xuxzh1's avatar
update  
xuxzh1 committed
6254
6255
        case GGML_TYPE_TQ1_0:   result = quantize_tq1_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_TQ2_0:   result = quantize_tq2_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
xuxzh1's avatar
init  
xuxzh1 committed
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
        case GGML_TYPE_IQ2_XXS: result = quantize_iq2_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_IQ2_XS:  result = quantize_iq2_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_IQ3_XXS: result = quantize_iq3_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_IQ3_S:   result = quantize_iq3_s  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_IQ2_S:   result = quantize_iq2_s  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_IQ1_S:   result = quantize_iq1_s  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_IQ1_M:   result = quantize_iq1_m  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_IQ4_NL:  result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_IQ4_XS:  result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_Q4_0_4_4: result = quantize_q4_0_4x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_Q4_0_4_8: result = quantize_q4_0_4x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_Q4_0_8_8: result = quantize_q4_0_8x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_F16:
            {
                size_t elemsize = sizeof(ggml_fp16_t);
                ggml_fp32_to_fp16_row(src + start, (ggml_fp16_t *)dst + start, n);
                result = n * elemsize;
            } break;
        case GGML_TYPE_BF16:
            {
                size_t elemsize = sizeof(ggml_bf16_t);
                ggml_fp32_to_bf16_row_ref(src + start, (ggml_bf16_t *)dst + start, n);
                result = n * elemsize;
            } break;
        case GGML_TYPE_F32:
            {
                size_t elemsize = sizeof(float);
                result = n * elemsize;
                memcpy((uint8_t *)dst + start * elemsize, src + start, result);
            } break;
        default:
            assert(false);
    }

    GGML_ASSERT(result == nrows * row_size);

    return result;
}

////////////////////////////////////////////////////////////////////////////////

// Length-prefixed string as used by the GGUF reader.
// When filled by gguf_fread_str(), `data` is a calloc'ed buffer of n + 1 bytes,
// so the n bytes read from the file are followed by a NUL terminator.
struct gguf_str {
    uint64_t n;  // GGUFv2 - number of bytes in `data`, not counting the terminator
    char * data; // string bytes; may be NULL (gguf_free_kv() checks before freeing)
};

// Size in bytes of one in-memory value of each GGUF type, indexed by enum gguf_type.
// Strings are stored as `struct gguf_str`; arrays have no fixed size and map to 0.
// The static_assert forces this table to be revisited when a GGUF type is added.
static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {
    [GGUF_TYPE_UINT8]   = sizeof(uint8_t),
    [GGUF_TYPE_INT8]    = sizeof(int8_t),
    [GGUF_TYPE_UINT16]  = sizeof(uint16_t),
    [GGUF_TYPE_INT16]   = sizeof(int16_t),
    [GGUF_TYPE_UINT32]  = sizeof(uint32_t),
    [GGUF_TYPE_INT32]   = sizeof(int32_t),
    [GGUF_TYPE_FLOAT32] = sizeof(float),
    [GGUF_TYPE_BOOL]    = sizeof(bool),
    [GGUF_TYPE_STRING]  = sizeof(struct gguf_str),
    [GGUF_TYPE_UINT64]  = sizeof(uint64_t),
    [GGUF_TYPE_INT64]   = sizeof(int64_t),
    [GGUF_TYPE_FLOAT64] = sizeof(double),
    [GGUF_TYPE_ARRAY]   = 0, // undefined
};
static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");

// Short display name of each GGUF type, indexed by enum gguf_type.
// The static_assert forces this table to be revisited when a GGUF type is added.
static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
    [GGUF_TYPE_UINT8]   = "u8",
    [GGUF_TYPE_INT8]    = "i8",
    [GGUF_TYPE_UINT16]  = "u16",
    [GGUF_TYPE_INT16]   = "i16",
    [GGUF_TYPE_UINT32]  = "u32",
    [GGUF_TYPE_INT32]   = "i32",
    [GGUF_TYPE_FLOAT32] = "f32",
    [GGUF_TYPE_BOOL]    = "bool",
    [GGUF_TYPE_STRING]  = "str",
    [GGUF_TYPE_ARRAY]   = "arr",
    [GGUF_TYPE_UINT64]  = "u64",
    [GGUF_TYPE_INT64]   = "i64",
    [GGUF_TYPE_FLOAT64] = "f64",
};
static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");

// Storage for the value of a key/value pair. Which member is valid is decided by
// the `type` field of the enclosing struct gguf_kv (see gguf_free_kv()).
union gguf_value {
    uint8_t  uint8;
    int8_t   int8;
    uint16_t uint16;
    int16_t  int16;
    uint32_t uint32;
    int32_t  int32;
    float    float32;
    uint64_t uint64;
    int64_t  int64;
    double   float64;
    bool     bool_;

    // used when the kv type is GGUF_TYPE_STRING
    struct gguf_str str;

    // used when the kv type is GGUF_TYPE_ARRAY
    struct {
        enum gguf_type type; // type of the array elements

        uint64_t n;  // GGUFv2 - number of elements
        void * data; // element storage; an array of struct gguf_str when type is GGUF_TYPE_STRING
    } arr;
};

// One key/value metadata entry. Owned heap data (key, string value, array value)
// is released by gguf_free_kv().
struct gguf_kv {
    struct gguf_str key;

    enum  gguf_type  type;  // selects the active member of `value`
    union gguf_value value;
};

// GGUF file header. gguf_init_from_file() reads magic, version, n_tensors and n_kv
// from the file in this order, one field at a time.
struct gguf_header {
    char magic[4]; // compared against GGUF_MAGIC when loading; not NUL-terminated

    uint32_t version;   // version 1 is rejected by gguf_init_from_file()
    uint64_t n_tensors; // GGUFv2
    uint64_t n_kv;      // GGUFv2
};

// Description of one tensor stored in a GGUF file.
// Values read from a file are validated by gguf_tensor_info_sanitize().
struct gguf_tensor_info {
    struct gguf_str name; // must be shorter than GGML_MAX_NAME to pass sanitization

    uint32_t n_dims;            // at most GGML_MAX_DIMS
    uint64_t ne[GGML_MAX_DIMS]; // number of elements per dimension

    enum ggml_type type;

    uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT`

    // for writing API
    const void * data;
    size_t size;
};

// In-memory representation of a GGUF file: header, key/value metadata,
// tensor descriptions and (optionally) the tensor data blob.
struct gguf_context {
    struct gguf_header header;

    struct gguf_kv          * kv;    // NULL in a freshly created empty context
    struct gguf_tensor_info * infos; // NULL in a freshly created empty context

    size_t alignment; // GGUF_DEFAULT_ALIGNMENT for a context made by gguf_init_empty()
    size_t offset;    // offset of `data` from beginning of file
    size_t size;      // size of `data` in bytes

    //uint8_t * padding;
    void * data;
};

// Returns the size in bytes of one value of the given GGUF type, as listed in
// GGUF_TYPE_SIZE (0 for arrays). Aborts if `type` is outside the valid enum range.
static size_t gguf_type_size(enum gguf_type type) {
    GGML_ASSERT(0 <= type && type < GGUF_TYPE_COUNT);

    const size_t type_size = GGUF_TYPE_SIZE[type];
    return type_size;
}

xuxzh1's avatar
update  
xuxzh1 committed
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
// Validates a tensor description read from a file.
//
// Rejects (returns false and prints a message to stderr):
//   - more than GGML_MAX_DIMS dimensions
//   - a type outside the range of enum ggml_type
//   - a name of GGML_MAX_NAME characters or more
//   - any of the first n_dims dimensions being zero
//   - a total element count that would not fit into int64_t
//
// Returns true if all checks pass.
// NOTE(review): info->name.data must be non-NULL, and the overflow checks divide by
// ne[1..3] regardless of n_dims - this presumably relies on the caller presetting
// unused dimensions to 1; confirm against the reader.
static bool gguf_tensor_info_sanitize(struct gguf_tensor_info * info) {
    if (info->n_dims > GGML_MAX_DIMS) {
        fprintf(stderr, "%s: invalid number of dimensions (%" PRIu32 ")\n", __func__, info->n_dims);
        return false;
    }

    if (info->type < 0 || info->type >= GGML_TYPE_COUNT) {
        fprintf(stderr, "%s: invalid type (%d)\n", __func__, info->type);
        return false;
    }

    if (strlen(info->name.data) >= GGML_MAX_NAME) {
        fprintf(stderr, "%s: tensor '%s' name is too long\n", __func__, info->name.data);
        return false;
    }

    for (uint32_t i = 0; i < info->n_dims; ++i) {
        if (info->ne[i] <= 0) {
            fprintf(stderr, "%s: invalid number of elements (%" PRIu64 ")\n", __func__, info->ne[i]);
            return false;
        }
    }

    // prevent overflow for total number of elements
    // (each check bounds the partial product, so the multiplication in the next check cannot wrap)
    if (INT64_MAX/info->ne[1] <= info->ne[0]) {
        fprintf(stderr, "%s: invalid number of elements (%" PRIu64 ")\n", __func__, info->ne[1]);
        return false;
    }

    if (INT64_MAX/info->ne[2] <= info->ne[0]*info->ne[1]) {
        fprintf(stderr, "%s: invalid number of elements (%" PRIu64 ")\n", __func__, info->ne[2]);
        return false;
    }

    if (INT64_MAX/info->ne[3] <= info->ne[0]*info->ne[1]*info->ne[2]) {
        fprintf(stderr, "%s: invalid number of elements (%" PRIu64 ")\n", __func__, info->ne[3]);
        return false;
    }

    return true;
}

// Reads exactly `size` bytes from `file` into `dst`.
//
// `*offset` is advanced by the number of bytes actually read, even on a short read.
// Returns true only if all `size` bytes were read.
static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) {
    const size_t bytes_read = fread(dst, 1, size, file);

    *offset = *offset + bytes_read;

    if (bytes_read != size) {
        return false;
    }
    return true;
}

// Reads a length-prefixed string from `file` into `p`.
//
// The on-disk layout is a 64-bit length followed by that many bytes. On success
// p->data is a newly allocated buffer of p->n + 1 bytes (zero-initialized, so the
// string is NUL-terminated) that the caller owns.
//
// `*offset` is advanced by the number of bytes consumed.
// Returns false if the length or the payload could not be read, if the length is
// too large to allocate, or if the allocation fails. On failure p->data is either
// NULL or a buffer the caller must still free.
static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
    p->n    = 0;
    p->data = NULL;

    // do not allocate based on a length that was not (fully) read
    if (!gguf_fread_el(file, &p->n, sizeof(p->n), offset)) {
        p->n = 0;
        return false;
    }

    // early exit if string length is invalid, prevents from integer overflow
    // (>= also covers targets where size_t is narrower than the 64-bit length field)
    if (p->n >= SIZE_MAX) {
        fprintf(stderr, "%s: invalid string length (%" PRIu64 ")\n", __func__, p->n);
        return false;
    }

    p->data = calloc(p->n + 1, 1);
    if (!p->data) {
        fprintf(stderr, "%s: failed to allocate memory for string of length %" PRIu64 "\n", __func__, p->n);
        return false;
    }

    return gguf_fread_el(file, p->data, p->n, offset);
}

// Frees the heap data owned by a key/value entry: the key string and, depending on
// the entry's type, the string value or the array storage (including each element
// string of a string array). The gguf_kv struct itself is not freed.
static void gguf_free_kv(struct gguf_kv * kv) {
    if (kv->key.data) {
        GGML_FREE(kv->key.data);
    }

    switch (kv->type) {
        case GGUF_TYPE_STRING:
            {
                if (kv->value.str.data) {
                    GGML_FREE(kv->value.str.data);
                }
            } break;
        case GGUF_TYPE_ARRAY:
            {
                if (!kv->value.arr.data) {
                    break;
                }

                if (kv->value.arr.type == GGUF_TYPE_STRING) {
                    // string arrays own each element's buffer as well
                    struct gguf_str * strs = (struct gguf_str *) kv->value.arr.data;
                    for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
                        if (strs[j].data) {
                            GGML_FREE(strs[j].data);
                        }
                    }
                }

                GGML_FREE(kv->value.arr.data);
            } break;
        default:
            // scalar values own no heap data
            break;
    }
}

struct gguf_context * gguf_init_empty(void) {
xuxzh1's avatar
update  
xuxzh1 committed
6508
6509
6510
6511
6512
    struct gguf_context * ctx = calloc(1, sizeof(struct gguf_context));
    if (!ctx) {
        fprintf(stderr, "%s: failed to allocate memory for context\n", __func__);
        return NULL;
    }
xuxzh1's avatar
init  
xuxzh1 committed
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557

    memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
    ctx->header.version   = GGUF_VERSION;
    ctx->header.n_tensors = 0;
    ctx->header.n_kv      = 0;

    ctx->kv    = NULL;
    ctx->infos = NULL;

    ctx->alignment = GGUF_DEFAULT_ALIGNMENT;
    ctx->offset    = 0;
    ctx->size      = 0;

    ctx->data = NULL;

    return ctx;
}

struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
    FILE * file = ggml_fopen(fname, "rb");
    if (!file) {
        fprintf(stderr, "%s: failed to open '%s': '%s'\n", __func__, fname, strerror(errno));
        return NULL;
    }

    // offset from start of file
    size_t offset = 0;

    char magic[4];

    // check the magic before making allocations
    {
        gguf_fread_el(file, &magic, sizeof(magic), &offset);

        for (uint32_t i = 0; i < sizeof(magic); i++) {
            if (magic[i] != GGUF_MAGIC[i]) {
                fprintf(stderr, "%s: invalid magic characters '%c%c%c%c'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
                fclose(file);
                return NULL;
            }
        }
    }

    bool ok = true;

xuxzh1's avatar
update  
xuxzh1 committed
6558
6559
6560
6561
6562
6563
    struct gguf_context * ctx = calloc(1, sizeof(struct gguf_context));
    if (!ctx) {
        fprintf(stderr, "%s: failed to allocate memory for context\n", __func__);
        fclose(file);
        return NULL;
    }
xuxzh1's avatar
init  
xuxzh1 committed
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601

    // read the header
    {
        strncpy(ctx->header.magic, magic, 4);

        ctx->kv    = NULL;
        ctx->infos = NULL;
        ctx->data  = NULL;

        ok = ok && gguf_fread_el(file, &ctx->header.version,   sizeof(ctx->header.version),   &offset);
        ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
        ok = ok && gguf_fread_el(file, &ctx->header.n_kv,      sizeof(ctx->header.n_kv),      &offset);

        if (ctx->header.version == 1) {
            fprintf(stderr, "%s: GGUFv1 is no longer supported. please use a more up-to-date version\n", __func__);
            fclose(file);
            gguf_free(ctx);
            return NULL;
        }

        // sanity-checks to prevent from integer/buffer overflows

        ok = ok && (ctx->header.n_tensors < (SIZE_MAX/2)/sizeof(struct gguf_tensor_info));
        ok = ok && (ctx->header.n_tensors < (SIZE_MAX/2)/ggml_tensor_overhead());
        ok = ok && (ctx->header.n_kv      < (SIZE_MAX/2)/sizeof(struct gguf_kv));

        if (!ok) {
            fprintf(stderr, "%s: failed to read header\n", __func__);
            fclose(file);
            gguf_free(ctx);
            return NULL;
        }
    }

    // read the kv pairs
    {
        const uint64_t n_kv = ctx->header.n_kv;

xuxzh1's avatar
update  
xuxzh1 committed
6602
6603
6604
6605
6606
6607
6608
        ctx->kv = calloc(n_kv, sizeof(struct gguf_kv));
        if (!ctx->kv) {
            fprintf(stderr, "%s: failed to allocate memory for kv pairs\n", __func__);
            fclose(file);
            gguf_free(ctx);
            return NULL;
        }
xuxzh1's avatar
init  
xuxzh1 committed
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658

        for (uint64_t i = 0; i < n_kv; ++i) {
            struct gguf_kv * kv = &ctx->kv[i];

            //fprintf(stderr, "%s: reading kv %d\n", __func__, i);

            ok = ok && gguf_fread_str(file, &kv->key,                    &offset);
            ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);

            //fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data);

            switch (kv->type) {
                case GGUF_TYPE_UINT8:   ok = ok && gguf_fread_el (file, &kv->value.uint8,   sizeof(kv->value.uint8),   &offset); break;
                case GGUF_TYPE_INT8:    ok = ok && gguf_fread_el (file, &kv->value.int8,    sizeof(kv->value.int8),    &offset); break;
                case GGUF_TYPE_UINT16:  ok = ok && gguf_fread_el (file, &kv->value.uint16,  sizeof(kv->value.uint16),  &offset); break;
                case GGUF_TYPE_INT16:   ok = ok && gguf_fread_el (file, &kv->value.int16,   sizeof(kv->value.int16),   &offset); break;
                case GGUF_TYPE_UINT32:  ok = ok && gguf_fread_el (file, &kv->value.uint32,  sizeof(kv->value.uint32),  &offset); break;
                case GGUF_TYPE_INT32:   ok = ok && gguf_fread_el (file, &kv->value.int32,   sizeof(kv->value.int32),   &offset); break;
                case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break;
                case GGUF_TYPE_UINT64:  ok = ok && gguf_fread_el (file, &kv->value.uint64,  sizeof(kv->value.uint64),  &offset); break;
                case GGUF_TYPE_INT64:   ok = ok && gguf_fread_el (file, &kv->value.int64,   sizeof(kv->value.int64),   &offset); break;
                case GGUF_TYPE_FLOAT64: ok = ok && gguf_fread_el (file, &kv->value.float64, sizeof(kv->value.float64), &offset); break;
                case GGUF_TYPE_BOOL:    ok = ok && gguf_fread_el (file, &kv->value.bool_,   sizeof(kv->value.bool_),   &offset); break;
                case GGUF_TYPE_STRING:  ok = ok && gguf_fread_str(file, &kv->value.str,                                &offset); break;
                case GGUF_TYPE_ARRAY:
                    {
                        ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
                        ok = ok && gguf_fread_el(file, &kv->value.arr.n,    sizeof(kv->value.arr.n),    &offset);

                        switch (kv->value.arr.type) {
                            case GGUF_TYPE_UINT8:
                            case GGUF_TYPE_INT8:
                            case GGUF_TYPE_UINT16:
                            case GGUF_TYPE_INT16:
                            case GGUF_TYPE_UINT32:
                            case GGUF_TYPE_INT32:
                            case GGUF_TYPE_FLOAT32:
                            case GGUF_TYPE_UINT64:
                            case GGUF_TYPE_INT64:
                            case GGUF_TYPE_FLOAT64:
                            case GGUF_TYPE_BOOL:
                                {
                                    // prevent from integer overflow in the malloc below
                                    if (kv->value.arr.n >= SIZE_MAX/gguf_type_size(kv->value.arr.type)) {
                                        fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
                                        fclose(file);
                                        gguf_free(ctx);
                                        return NULL;
                                    }

xuxzh1's avatar
update  
xuxzh1 committed
6659
6660
6661
6662
6663
6664
6665
                                    kv->value.arr.data = calloc(kv->value.arr.n, gguf_type_size(kv->value.arr.type));
                                    if (!kv->value.arr.data) {
                                        fprintf(stderr, "%s: failed to allocate memory for array\n", __func__);
                                        fclose(file);
                                        gguf_free(ctx);
                                        return NULL;
                                    }
xuxzh1's avatar
init  
xuxzh1 committed
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678

                                    ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type), &offset);
                                } break;
                            case GGUF_TYPE_STRING:
                                {
                                    // prevent from integer overflow in the malloc below
                                    if (kv->value.arr.n >= SIZE_MAX/sizeof(struct gguf_str)) {
                                        fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
                                        fclose(file);
                                        gguf_free(ctx);
                                        return NULL;
                                    }

xuxzh1's avatar
update  
xuxzh1 committed
6679
6680
6681
6682
6683
6684
6685
                                    kv->value.arr.data = calloc(kv->value.arr.n, sizeof(struct gguf_str));
                                    if (!kv->value.arr.data) {
                                        fprintf(stderr, "%s: failed to allocate memory for array\n", __func__);
                                        fclose(file);
                                        gguf_free(ctx);
                                        return NULL;
                                    }
xuxzh1's avatar
init  
xuxzh1 committed
6686
6687
6688
6689
6690
6691

                                    for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
                                        ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
                                    }
                                } break;
                            case GGUF_TYPE_ARRAY:
xuxzh1's avatar
update  
xuxzh1 committed
6692
6693
6694
6695
6696
                            default:
                                {
                                    fprintf(stderr, "%s: invalid array type %d\n", __func__, kv->value.arr.type);
                                    ok = false;
                                } break;
xuxzh1's avatar
init  
xuxzh1 committed
6697
6698
                        }
                    } break;
xuxzh1's avatar
update  
xuxzh1 committed
6699
6700
6701
6702
6703
                default:
                    {
                        fprintf(stderr, "%s: invalid type %d\n", __func__, kv->type);
                        ok = false;
                    } break;
xuxzh1's avatar
init  
xuxzh1 committed
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
            }

            if (!ok) {
                break;
            }
        }

        if (!ok) {
            fprintf(stderr, "%s: failed to read key-value pairs\n", __func__);
            fclose(file);
            gguf_free(ctx);
            return NULL;
        }
    }

    // read the tensor infos
    if (ctx->header.n_tensors > 0) {
xuxzh1's avatar
update  
xuxzh1 committed
6721
6722
6723
6724
6725
6726
6727
        ctx->infos = calloc(ctx->header.n_tensors, sizeof(struct gguf_tensor_info));
        if (!ctx->infos) {
            fprintf(stderr, "%s: failed to allocate memory for tensor infos\n", __func__);
            fclose(file);
            gguf_free(ctx);
            return NULL;
        }
xuxzh1's avatar
init  
xuxzh1 committed
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747

        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
            struct gguf_tensor_info * info = &ctx->infos[i];

            for (int j = 0; j < GGML_MAX_DIMS; ++j) {
                info->ne[j] = 1;
            }

            ok = ok && gguf_fread_str(file, &info->name,                          &offset);
            ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims),  &offset);

            ok = ok && (info->n_dims <= GGML_MAX_DIMS);

            for (uint32_t j = 0; j < info->n_dims; ++j) {
                ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
            }

            ok = ok && gguf_fread_el (file, &info->type,   sizeof(info->type),    &offset);
            ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset),  &offset);

xuxzh1's avatar
update  
xuxzh1 committed
6748
            ok = ok && gguf_tensor_info_sanitize(info);
xuxzh1's avatar
init  
xuxzh1 committed
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798

            // make sure there is no duplicated tensor names
            for (uint64_t j = 0; j < i && ok; ++j) {
                if (strcmp(info->name.data, ctx->infos[j].name.data) == 0) {
                    fprintf(stderr, "%s: duplicated tensor name %s\n", __func__, info->name.data);
                    ok = false;
                }
            }

            if (!ok) {
                fprintf(stderr, "%s: failed to read tensor info\n", __func__);
                fclose(file);
                gguf_free(ctx);
                return NULL;
            }
        }
    }

    ctx->alignment = GGUF_DEFAULT_ALIGNMENT;

    int alignment_idx = gguf_find_key(ctx, "general.alignment");
    if (alignment_idx != -1) {
        ctx->alignment = gguf_get_val_u32(ctx, alignment_idx);
    }

    // we require the data section to be aligned, so take into account any padding
    {
        const size_t offset_pad = offset % ctx->alignment;

        if (offset_pad != 0) {
            offset += ctx->alignment - offset_pad;
            fseek(file, offset, SEEK_SET);
        }
    }

    // store the current file offset - this is where the data section starts
    ctx->offset = offset;

    // compute the total size of the data section, taking into account the alignment
    {
        ctx->size = 0;
        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
            struct gguf_tensor_info * info = &ctx->infos[i];

            const int64_t ne =
                (int64_t) info->ne[0] *
                (int64_t) info->ne[1] *
                (int64_t) info->ne[2] *
                (int64_t) info->ne[3];

xuxzh1's avatar
update  
xuxzh1 committed
6799
            if (ggml_blck_size(info->type) == 0 || ne % ggml_blck_size(info->type) != 0) {
xuxzh1's avatar
init  
xuxzh1 committed
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
                fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n",
                        __func__, info->name.data, (int) info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
                fclose(file);
                gguf_free(ctx);
                return NULL;
            }

            const size_t size_cur = ggml_row_size(info->type, ne);

            ctx->size += GGML_PAD(size_cur, ctx->alignment);
        }
    }

    // load the tensor data only if requested
    if (params.ctx != NULL) {
        // if the provided gguf_context is no_alloc, then we create "empty" tensors and do not read the binary blob
        // otherwise, we load the binary blob into the created ggml_context as well, and point the "data" members of
        // the ggml_tensor structs to the appropriate locations in the binary blob

        // compute the exact size needed for the new ggml_context
        const size_t mem_size =
            params.no_alloc ?
            (ctx->header.n_tensors    )*ggml_tensor_overhead() :
            (ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size;

        struct ggml_init_params pdata = {
            .mem_size   = mem_size,
            .mem_buffer = NULL,
            .no_alloc   = params.no_alloc,
        };

        *params.ctx = ggml_init(pdata);
        if (*params.ctx == NULL) {
            fprintf(stderr, "%s: failed to initialize context\n", __func__);
            fclose(file);
            gguf_free(ctx);
            return NULL;
        }

        struct ggml_context * ctx_data = *params.ctx;

        struct ggml_tensor * data = NULL;

        if (!params.no_alloc) {
            data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);

            ok = ok && data != NULL;

            // read the binary blob with the tensor data
            ok = ok && gguf_fread_el(file, data->data, ctx->size, &offset);

            if (!ok) {
                fprintf(stderr, "%s: failed to read tensor data\n", __func__);
                fclose(file);
                ggml_free(ctx_data);
                gguf_free(ctx);
                return NULL;
            }

            ctx->data = data->data;
        }

        ggml_set_no_alloc(ctx_data, true);

        // create the tensors
        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
            const int64_t ne[GGML_MAX_DIMS] = {
                ctx->infos[i].ne[0],
                ctx->infos[i].ne[1],
                ctx->infos[i].ne[2],
                ctx->infos[i].ne[3],
            };

            struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne);

            ok = ok && cur != NULL;

            if (!ok) {
                break;
            }

            ggml_set_name(cur, ctx->infos[i].name.data);

            // point the data member to the appropriate location in the binary blob using the tensor infos
            if (!params.no_alloc) {
              //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
                cur->data = (char *) data->data + ctx->infos[i].offset;               // offset from data
            }
        }

        if (!ok) {
            fprintf(stderr, "%s: failed to read the tensor data\n", __func__);
            fclose(file);
            ggml_free(ctx_data);
            gguf_free(ctx);
            return NULL;
        }

        ggml_set_no_alloc(ctx_data, params.no_alloc);
    }

    fclose(file);

    return ctx;
}

// Destroys a GGUF context: every KV entry (via gguf_free_kv), the KV table, every tensor-info
// name, the tensor-info table, and finally the context itself. Passing NULL is a no-op.
void gguf_free(struct gguf_context * ctx) {
    if (ctx == NULL) {
        return;
    }

    if (ctx->kv) {
        // each entry owns its key and possibly a string/array payload
        const uint64_t n_kv = ctx->header.n_kv;
        for (uint64_t k = 0; k < n_kv; ++k) {
            gguf_free_kv(ctx->kv + k);
        }

        GGML_FREE(ctx->kv);
    }

    if (ctx->infos) {
        const uint64_t n_infos = ctx->header.n_tensors;
        for (uint64_t t = 0; t < n_infos; ++t) {
            char * name = ctx->infos[t].name.data;
            if (name) {
                GGML_FREE(name);
            }
        }

        GGML_FREE(ctx->infos);
    }

    GGML_FREE(ctx);
}

// Looks up the name of a GGUF value type in the GGUF_TYPE_NAME table.
// No range check is performed on `type` here.
const char * gguf_type_name(enum gguf_type type) {
    const char * name = GGUF_TYPE_NAME[type];
    return name;
}

// Returns the version field of the context header, converted to int.
int gguf_get_version(const struct gguf_context * ctx) {
    const int version = ctx->header.version;
    return version;
}

// Returns the alignment stored in the context.
size_t gguf_get_alignment(const struct gguf_context * ctx) {
    const size_t alignment = ctx->alignment;
    return alignment;
}

// Returns the offset stored in the context (where the data section starts).
size_t gguf_get_data_offset(const struct gguf_context * ctx) {
    const size_t data_offset = ctx->offset;
    return data_offset;
}

// Returns the data pointer stored in the context (may be NULL when no data is attached).
void * gguf_get_data(const struct gguf_context * ctx) {
    void * blob = ctx->data;
    return blob;
}

// Returns the number of KV pairs recorded in the header, converted to int.
int gguf_get_n_kv(const struct gguf_context * ctx) {
    const int count = ctx->header.n_kv;
    return count;
}

// Linear search for `key` among the KV pairs.
// Returns the index of the first entry whose key compares equal, or -1 if there is none.
int gguf_find_key(const struct gguf_context * ctx, const char * key) {
    const int count = gguf_get_n_kv(ctx);

    for (int idx = 0; idx < count; ++idx) {
        const char * candidate = gguf_get_key(ctx, idx);
        if (strcmp(key, candidate) == 0) {
            return idx;
        }
    }

    // not found
    return -1;
}

const char * gguf_get_key(const struct gguf_context * ctx, int key_id) {
    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    return ctx->kv[key_id].key.data;
}

enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int key_id) {
    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    return ctx->kv[key_id].type;
}

enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id) {
    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
    return ctx->kv[key_id].value.arr.type;
}

const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id) {
    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
    return ctx->kv[key_id].value.arr.data;
}

const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i) {
    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
    struct gguf_kv * kv = &ctx->kv[key_id];
    struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
    return str->data;
}

// Returns the element count of the array stored at KV index `key_id`, converted to int.
// Asserts that the index is valid and that the entry is an array.
int gguf_get_arr_n(const struct gguf_context * ctx, int key_id) {
    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);

    const int count = ctx->kv[key_id].value.arr.n;
    return count;
}

uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int key_id) {
    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT8);
    return ctx->kv[key_id].value.uint8;
}

int8_t gguf_get_val_i8(const struct gguf_context * ctx, int key_id) {
    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT8);
    return ctx->kv[key_id].value.int8;
}

uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int key_id) {
    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT16);
    return ctx->kv[key_id].value.uint16;
}

int16_t gguf_get_val_i16(const struct gguf_context * ctx, int key_id) {
    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT16);
    return ctx->kv[key_id].value.int16;
}

uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int key_id) {
    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT32);
    return ctx->kv[key_id].value.uint32;
}

int32_t gguf_get_val_i32(const struct gguf_context * ctx, int key_id) {
    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT32);
    return ctx->kv[key_id].value.int32;
}

float gguf_get_val_f32(const struct gguf_context * ctx, int key_id) {
    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT32);
    return ctx->kv[key_id].value.float32;
}

uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int key_id) {
    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT64);
    return ctx->kv[key_id].value.uint64;
}

int64_t gguf_get_val_i64(const struct gguf_context * ctx, int key_id) {
    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT64);
    return ctx->kv[key_id].value.int64;
}

double gguf_get_val_f64(const struct gguf_context * ctx, int key_id) {
    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT64);
    return ctx->kv[key_id].value.float64;
}

bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id) {
    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_BOOL);
    return ctx->kv[key_id].value.bool_;
}

const char * gguf_get_val_str(const struct gguf_context * ctx, int key_id) {
    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_STRING);
    return ctx->kv[key_id].value.str.data;
}

const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id) {
    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_ARRAY);
    GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_STRING);
    return &ctx->kv[key_id].value;
}

// Returns the number of tensors recorded in the header, converted to int.
int gguf_get_n_tensors(const struct gguf_context * ctx) {
    const int count = ctx->header.n_tensors;
    return count;
}

// Linear search for a tensor info by name.
// Returns the index of the first tensor whose name compares equal, or -1 if there is none.
int gguf_find_tensor(const struct gguf_context * ctx, const char * name) {
    const int count = gguf_get_n_tensors(ctx);

    for (int idx = 0; idx < count; ++idx) {
        const char * candidate = gguf_get_tensor_name(ctx, idx);
        if (strcmp(name, candidate) == 0) {
            return idx;
        }
    }

    // not found
    return -1;
}

// Returns the stored offset of tensor info `i`. The index is not range-checked here.
size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i) {
    const size_t tensor_offset = ctx->infos[i].offset;
    return tensor_offset;
}

// Returns the name string of tensor info `i`. The index is not range-checked here.
char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
    char * name = ctx->infos[i].name.data;
    return name;
}

enum ggml_type gguf_get_tensor_type(const struct gguf_context * ctx, int i) {
    return ctx->infos[i].type;
}

// Returns the index of the KV entry named `key`, appending a new entry when the key is not
// present yet.
//
// A newly appended entry has its key set (an owned copy of `key`) and the rest of the entry
// zeroed; the caller is expected to fill in the type and value. Allocation failures are fatal
// (GGML_ASSERT), since callers index ctx->kv with the result unconditionally.
static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
    const int idx = gguf_find_key(ctx, key);
    if (idx >= 0) {
        return idx;
    }

    const int n_kv = gguf_get_n_kv(ctx);

    // grow through a temporary so the existing table is not lost if realloc fails
    struct gguf_kv * kv_grown = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
    GGML_ASSERT(kv_grown != NULL);
    ctx->kv = kv_grown;

    // realloc leaves the appended slot uninitialized
    memset(&ctx->kv[n_kv], 0, sizeof(struct gguf_kv));

    char * key_copy = strdup(key);
    GGML_ASSERT(key_copy != NULL);

    ctx->kv[n_kv].key.n    = strlen(key);
    ctx->kv[n_kv].key.data = key_copy;
    ctx->header.n_kv++;

    return n_kv;
}

// Removes the KV entry named `key`, if present: frees the memory it owns, shifts the following
// entries down by one and shrinks the table. Does nothing when the key is not found.
void gguf_remove_key(struct gguf_context * ctx, const char * key) {
    const int idx = gguf_find_key(ctx, key);
    if (idx < 0) {
        return;
    }

    const int n_kv = gguf_get_n_kv(ctx);

    gguf_free_kv(&ctx->kv[idx]);
    for (int i = idx; i < n_kv-1; ++i) {
        ctx->kv[i] = ctx->kv[i+1];
    }

    if (n_kv == 1) {
        // the table is now empty: free it explicitly instead of calling realloc with size 0,
        // whose behavior is not portable
        GGML_FREE(ctx->kv);
        ctx->kv = NULL;
    } else {
        // if shrinking fails the original (larger) block is still valid, so keep using it
        struct gguf_kv * kv_shrunk = realloc(ctx->kv, (n_kv - 1) * sizeof(struct gguf_kv));
        if (kv_shrunk != NULL) {
            ctx->kv = kv_shrunk;
        }
    }

    ctx->header.n_kv--;
}

void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) {
    const int idx = gguf_get_or_add_key(ctx, key);

    ctx->kv[idx].type        = GGUF_TYPE_UINT8;
    ctx->kv[idx].value.uint8 = val;
}

void gguf_set_val_i8(struct gguf_context * ctx, const char * key, int8_t val) {
    const int idx = gguf_get_or_add_key(ctx, key);

    ctx->kv[idx].type       = GGUF_TYPE_INT8;
    ctx->kv[idx].value.int8 = val;
}

void gguf_set_val_u16(struct gguf_context * ctx, const char * key, uint16_t val) {
    const int idx = gguf_get_or_add_key(ctx, key);

    ctx->kv[idx].type         = GGUF_TYPE_UINT16;
    ctx->kv[idx].value.uint16 = val;
}

void gguf_set_val_i16(struct gguf_context * ctx, const char * key, int16_t val) {
    const int idx = gguf_get_or_add_key(ctx, key);

    ctx->kv[idx].type        = GGUF_TYPE_INT16;
    ctx->kv[idx].value.int16 = val;
}

void gguf_set_val_u32(struct gguf_context * ctx, const char * key, uint32_t val) {
    const int idx = gguf_get_or_add_key(ctx, key);

    ctx->kv[idx].type         = GGUF_TYPE_UINT32;
    ctx->kv[idx].value.uint32 = val;
}

void gguf_set_val_i32(struct gguf_context * ctx, const char * key, int32_t val) {
    const int idx = gguf_get_or_add_key(ctx, key);

    ctx->kv[idx].type        = GGUF_TYPE_INT32;
    ctx->kv[idx].value.int32 = val;
}

void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) {
    const int idx = gguf_get_or_add_key(ctx, key);

    ctx->kv[idx].type          = GGUF_TYPE_FLOAT32;
    ctx->kv[idx].value.float32 = val;
}

void gguf_set_val_u64(struct gguf_context * ctx, const char * key, uint64_t val) {
    const int idx = gguf_get_or_add_key(ctx, key);

    ctx->kv[idx].type         = GGUF_TYPE_UINT64;
    ctx->kv[idx].value.uint64 = val;
}

void gguf_set_val_i64(struct gguf_context * ctx, const char * key, int64_t val) {
    const int idx = gguf_get_or_add_key(ctx, key);

    ctx->kv[idx].type        = GGUF_TYPE_INT64;
    ctx->kv[idx].value.int64 = val;
}

void gguf_set_val_f64(struct gguf_context * ctx, const char * key, double val) {
    const int idx = gguf_get_or_add_key(ctx, key);

    ctx->kv[idx].type          = GGUF_TYPE_FLOAT64;
    ctx->kv[idx].value.float64 = val;
}

void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) {
    const int idx = gguf_get_or_add_key(ctx, key);

    ctx->kv[idx].type        = GGUF_TYPE_BOOL;
    ctx->kv[idx].value.bool_ = val;
}

void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char * val) {
    const int idx = gguf_get_or_add_key(ctx, key);

    ctx->kv[idx].type           = GGUF_TYPE_STRING;
    ctx->kv[idx].value.str.n    = strlen(val);
    ctx->kv[idx].value.str.data = strdup(val);
}

// Stores a copy of `n` elements of type `type` from `data` under `key` as an ARRAY value,
// adding the key if it does not exist yet.
//
// `n` must not be negative. When `n` is 0 nothing is copied, so `data` may be NULL in that case
// and a NULL result from the zero-sized allocation is never passed to memcpy.
void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n) {
    GGML_ASSERT(n >= 0);

    const int idx = gguf_get_or_add_key(ctx, key);

    ctx->kv[idx].type           = GGUF_TYPE_ARRAY;
    ctx->kv[idx].value.arr.type = type;
    ctx->kv[idx].value.arr.n    = n;
    ctx->kv[idx].value.arr.data = GGML_CALLOC(n, gguf_type_size(type));
    if (n > 0) {
        memcpy(ctx->kv[idx].value.arr.data, data, n*gguf_type_size(type));
    }
}

void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, int n) {
    const int idx = gguf_get_or_add_key(ctx, key);

    ctx->kv[idx].type           = GGUF_TYPE_ARRAY;
    ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
    ctx->kv[idx].value.arr.n    = n;
    ctx->kv[idx].value.arr.data = GGML_CALLOC(n, sizeof(struct gguf_str));
    for (int i = 0; i < n; i++) {
        struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
        str->n    = strlen(data[i]);
        str->data = strdup(data[i]);
    }
}

// set or add KV pairs from another context
void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
    for (uint32_t i = 0; i < src->header.n_kv; i++) {
        switch (src->kv[i].type) {
            case GGUF_TYPE_UINT8:   gguf_set_val_u8  (ctx, src->kv[i].key.data, src->kv[i].value.uint8);    break;
            case GGUF_TYPE_INT8:    gguf_set_val_i8  (ctx, src->kv[i].key.data, src->kv[i].value.int8);     break;
            case GGUF_TYPE_UINT16:  gguf_set_val_u16 (ctx, src->kv[i].key.data, src->kv[i].value.uint16);   break;
            case GGUF_TYPE_INT16:   gguf_set_val_i16 (ctx, src->kv[i].key.data, src->kv[i].value.int16);    break;
            case GGUF_TYPE_UINT32:  gguf_set_val_u32 (ctx, src->kv[i].key.data, src->kv[i].value.uint32);   break;
            case GGUF_TYPE_INT32:   gguf_set_val_i32 (ctx, src->kv[i].key.data, src->kv[i].value.int32);    break;
            case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, src->kv[i].key.data, src->kv[i].value.float32);  break;
            case GGUF_TYPE_UINT64:  gguf_set_val_u64 (ctx, src->kv[i].key.data, src->kv[i].value.uint64);   break;
            case GGUF_TYPE_INT64:   gguf_set_val_i64 (ctx, src->kv[i].key.data, src->kv[i].value.int64);    break;
            case GGUF_TYPE_FLOAT64: gguf_set_val_f64 (ctx, src->kv[i].key.data, src->kv[i].value.float64);  break;
            case GGUF_TYPE_BOOL:    gguf_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.bool_);    break;
            case GGUF_TYPE_STRING:  gguf_set_val_str (ctx, src->kv[i].key.data, src->kv[i].value.str.data); break;
            case GGUF_TYPE_ARRAY:
                {
                    if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
                        const char ** data = GGML_CALLOC(src->kv[i].value.arr.n, sizeof(char *));
                        for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
                            data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
                        }
                        gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
                        GGML_FREE((void *)data);
                    } else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
                        GGML_ABORT("nested arrays not supported");
                    } else {
                        gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n);
                    }
                } break;
            default: GGML_ABORT("invalid type");
        }
    }
}

// Append the description of `tensor` to the tensor infos of `ctx`.
//
// The tensor name is duplicated; the data pointer is stored as-is (no copy of
// the tensor data is made here). Dimensions beyond ggml_n_dims(tensor) are
// recorded as 1. The offset of the new entry is the previous entry's offset
// plus its size padded to ctx->alignment, or 0 for the first tensor.
//
// Aborts when `tensor` is NULL, when a tensor with the same name is already
// present, or when memory for the new entry cannot be allocated.
void gguf_add_tensor(
             struct gguf_context * ctx,
        const struct ggml_tensor * tensor) {
    GGML_ASSERT(tensor);
    if (gguf_find_tensor(ctx, tensor->name) != -1) {
        GGML_ABORT("duplicated tensor name");
    }

    const int idx = ctx->header.n_tensors;

    // grow through a temporary so that a failed realloc is reported instead of
    // being dereferenced (and the old array is not lost)
    struct gguf_tensor_info * infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
    if (!infos) {
        GGML_ABORT("failed to allocate memory for tensor infos");
    }
    ctx->infos = infos;

    ctx->infos[idx].name.n    = strlen(tensor->name);
    ctx->infos[idx].name.data = strdup(tensor->name);
    GGML_ASSERT(ctx->infos[idx].name.data); // strdup returns NULL when memory is exhausted

    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
        ctx->infos[idx].ne[i] = 1;
    }

    ctx->infos[idx].n_dims = ggml_n_dims(tensor);
    for (uint32_t i = 0; i < ctx->infos[idx].n_dims; i++) {
        ctx->infos[idx].ne[i] = tensor->ne[i];
    }

    ctx->infos[idx].type   = tensor->type;
    ctx->infos[idx].offset = 0;
    ctx->infos[idx].data   = tensor->data;
    ctx->infos[idx].size   = ggml_nbytes(tensor);

    // tensors are laid out back to back, each one padded to the alignment
    if (ctx->header.n_tensors > 0) {
        ctx->infos[idx].offset = ctx->infos[idx - 1].offset + GGML_PAD(ctx->infos[idx - 1].size, ctx->alignment);
    }

    ctx->header.n_tensors++;
}

void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
    const int idx = gguf_find_tensor(ctx, name);
    if (idx < 0) {
        GGML_ABORT("tensor not found");
    }

    ctx->infos[idx].type = type;
}

void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size) {
    const int idx = gguf_find_tensor(ctx, name);
    if (idx < 0) {
        GGML_ABORT("tensor not found");
    }

    ctx->infos[idx].data = data;
    ctx->infos[idx].size = size;

    // update offsets
    for (uint32_t i = idx + 1; i < ctx->header.n_tensors; ++i) {
        ctx->infos[i].offset = ctx->infos[i - 1].offset + GGML_PAD(ctx->infos[i - 1].size, ctx->alignment);
    }
}

//static void gguf_fwrite_str(FILE * file, const struct gguf_str * val) {
//    fwrite(&val->n,   sizeof(val->n),    1, file);
//    fwrite(val->data, sizeof(char), val->n, file);
//}
//
//static void gguf_fwrite_el(FILE * file, const void * val, size_t size) {
//    fwrite(val, sizeof(char), size, file);
//}

// Growable byte buffer used to serialize a gguf context.
// When `data` is NULL the write helpers below skip the copy and only advance
// `offset`, which lets the same code compute the serialized size.
struct gguf_buf {
    void * data;   // backing storage, or NULL when only the size is being computed
    size_t size;   // current capacity in bytes
    size_t offset; // write position, i.e. number of bytes written so far
};

static struct gguf_buf gguf_buf_init(size_t size) {
    struct gguf_buf buf = {
        /*buf.data   =*/ size == 0 ? NULL : GGML_CALLOC(1, size),
        /*buf.size   =*/ size,
        /*buf.offset =*/ 0,
    };

    return buf;
}

// Release the storage of `buf`, if it has any.
// The buffer is passed by value, so the caller's copy still holds the old pointer afterwards.
static void gguf_buf_free(struct gguf_buf buf) {
    if (buf.data == NULL) {
        return;
    }

    GGML_FREE(buf.data);
}

// Make sure `size` more bytes fit at the current write position.
//
// When the capacity is insufficient it becomes 1.5x the required total,
// computed in integer arithmetic (required + required/2 equals the truncated
// 1.5*required without the round trip through double). A buffer without
// storage (data == NULL) only has its capacity updated; nothing is allocated.
//
// Aborts when the reallocation fails: leaving `data` NULL would leak the old
// block and make the write helpers silently drop all further output.
static void gguf_buf_grow(struct gguf_buf * buf, size_t size) {
    const size_t required = buf->offset + size;
    if (required <= buf->size) {
        return;
    }

    buf->size = required + required/2;

    if (buf->data) {
        void * data = realloc(buf->data, buf->size);
        if (!data) {
            GGML_ABORT("failed to allocate memory for gguf buffer");
        }
        buf->data = data;
    }
}

// Append a string to `buf`: first the length field val->n, then val->n bytes
// of character data (no terminator). Without backing storage only the write
// position advances.
static void gguf_bwrite_str(struct gguf_buf * buf, const struct gguf_str * val) {
    const size_t len_field = sizeof(val->n);

    gguf_buf_grow(buf, len_field + val->n);

    if (buf->data != NULL) {
        char * dst = (char *) buf->data + buf->offset;

        memcpy(dst,             &val->n,   len_field);
        memcpy(dst + len_field, val->data, val->n);
    }

    buf->offset += len_field + val->n;
}

// Append `el_size` raw bytes read from `val` to `buf`.
// Without backing storage only the write position advances.
static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_size) {
    gguf_buf_grow(buf, el_size);

    if (buf->data != NULL) {
        char * dst = (char *) buf->data + buf->offset;
        memcpy(dst, val, el_size);
    }

    buf->offset += el_size;
}

static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
    // write header
    gguf_bwrite_el(buf, &ctx->header.magic,     sizeof(ctx->header.magic));
    gguf_bwrite_el(buf, &ctx->header.version,   sizeof(ctx->header.version));
    gguf_bwrite_el(buf, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors));
    gguf_bwrite_el(buf, &ctx->header.n_kv,      sizeof(ctx->header.n_kv));

    // write key-value pairs
    for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
        struct gguf_kv * kv = &ctx->kv[i];

        gguf_bwrite_str(buf, &kv->key);
        gguf_bwrite_el (buf, &kv->type, sizeof(kv->type));

        switch (kv->type) {
            case GGUF_TYPE_UINT8:   gguf_bwrite_el( buf, &kv->value.uint8,   sizeof(kv->value.uint8)  ); break;
            case GGUF_TYPE_INT8:    gguf_bwrite_el (buf, &kv->value.int8,    sizeof(kv->value.int8)   ); break;
            case GGUF_TYPE_UINT16:  gguf_bwrite_el (buf, &kv->value.uint16,  sizeof(kv->value.uint16) ); break;
            case GGUF_TYPE_INT16:   gguf_bwrite_el (buf, &kv->value.int16,   sizeof(kv->value.int16)  ); break;
            case GGUF_TYPE_UINT32:  gguf_bwrite_el (buf, &kv->value.uint32,  sizeof(kv->value.uint32) ); break;
            case GGUF_TYPE_INT32:   gguf_bwrite_el (buf, &kv->value.int32,   sizeof(kv->value.int32)  ); break;
            case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break;
            case GGUF_TYPE_UINT64:  gguf_bwrite_el (buf, &kv->value.uint64,  sizeof(kv->value.uint64) ); break;
            case GGUF_TYPE_INT64:   gguf_bwrite_el (buf, &kv->value.int64,   sizeof(kv->value.int64)  ); break;
            case GGUF_TYPE_FLOAT64: gguf_bwrite_el (buf, &kv->value.float64, sizeof(kv->value.float64)); break;
            case GGUF_TYPE_BOOL:    gguf_bwrite_el (buf, &kv->value.bool_,   sizeof(kv->value.bool_)  ); break;
            case GGUF_TYPE_STRING:  gguf_bwrite_str(buf, &kv->value.str                               ); break;
            case GGUF_TYPE_ARRAY:
                {
                    gguf_bwrite_el(buf, &kv->value.arr.type, sizeof(kv->value.arr.type));
                    gguf_bwrite_el(buf, &kv->value.arr.n,    sizeof(kv->value.arr.n)   );

                    switch (kv->value.arr.type) {
                        case GGUF_TYPE_UINT8:
                        case GGUF_TYPE_INT8:
                        case GGUF_TYPE_UINT16:
                        case GGUF_TYPE_INT16:
                        case GGUF_TYPE_UINT32:
                        case GGUF_TYPE_INT32:
                        case GGUF_TYPE_FLOAT32:
                        case GGUF_TYPE_UINT64:
                        case GGUF_TYPE_INT64:
                        case GGUF_TYPE_FLOAT64:
                        case GGUF_TYPE_BOOL:
                            {
                                gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type));
                            } break;
                        case GGUF_TYPE_STRING:
                            {
                                for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
                                    gguf_bwrite_str(buf, &((struct gguf_str *) kv->value.arr.data)[j]);
                                }
                            } break;
                        case GGUF_TYPE_ARRAY:
                        default: GGML_ABORT("invalid type");
                    }
                } break;
            default: GGML_ABORT("invalid type");
        }
    }

    // write tensor infos
    for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
        struct gguf_tensor_info * info = &ctx->infos[i];

        gguf_bwrite_str(buf, &info->name);
        gguf_bwrite_el (buf, &info->n_dims, sizeof(info->n_dims));
        for (uint32_t j = 0; j < info->n_dims; ++j) {
            gguf_bwrite_el(buf, &info->ne[j], sizeof(info->ne[j]));
        }
        gguf_bwrite_el(buf, &info->type,   sizeof(info->type));
        gguf_bwrite_el(buf, &info->offset, sizeof(info->offset));
    }

    // we require the data section to be aligned, so take into account any padding
    {
        const size_t offset     = buf->offset;
        const size_t offset_pad = GGML_PAD(offset, ctx->alignment);

        if (offset_pad != offset) {
            uint8_t pad = 0;
            for (size_t i = 0; i < offset_pad - offset; ++i) {
                gguf_bwrite_el(buf, &pad, sizeof(pad));
            }
        }
    }

    if (only_meta) {
        return;
    }

    size_t offset = 0;

    // write tensor data
    for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
        struct gguf_tensor_info * info = &ctx->infos[i];

        const size_t size     = info->size;
        const size_t size_pad = GGML_PAD(size, ctx->alignment);

        gguf_bwrite_el(buf, info->data, size);

        if (size_pad != size) {
            uint8_t pad = 0;
            for (size_t j = 0; j < size_pad - size; ++j) {
                gguf_bwrite_el(buf, &pad, sizeof(pad));
            }
        }

        GGML_ASSERT(offset == info->offset);

        offset += size_pad;
    }
}

// Serialize `ctx` and write the result to the file `fname` (opened in "wb" mode).
// With `only_meta` set, the tensor data is left out.
//
// The whole image is built in a memory buffer first and written with a single
// fwrite(). Aborts when the file cannot be opened, when fewer bytes than
// expected are written, or when closing (and thereby flushing) the file fails,
// so that a truncated file is never silently left behind.
void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
    FILE * file = ggml_fopen(fname, "wb");
    if (!file) {
        GGML_ABORT("failed to open file for writing");
    }

    struct gguf_buf buf = gguf_buf_init(16*1024);

    gguf_write_to_buf(ctx, &buf, only_meta);

    const size_t n_written = fwrite(buf.data, 1, buf.offset, file);
    const bool   write_ok  = n_written == buf.offset;

    gguf_buf_free(buf);

    // fclose flushes buffered output, so its result matters for a write stream
    const bool close_ok = fclose(file) == 0;

    if (!write_ok || !close_ok) {
        GGML_ABORT("failed to write file");
    }
}

size_t gguf_get_meta_size(const struct gguf_context * ctx) {
    // no allocs - only compute size
    struct gguf_buf buf = gguf_buf_init(0);

    gguf_write_to_buf(ctx, &buf, true);

    return buf.offset;
}

void gguf_get_meta_data(const struct gguf_context * ctx, void * data) {
    struct gguf_buf buf = gguf_buf_init(16*1024);

    gguf_write_to_buf(ctx, &buf, true);

    memcpy(data, buf.data, buf.offset);

    gguf_buf_free(buf);
}

xuxzh1's avatar
update  
xuxzh1 committed
7570
7571
7572
// Install `log_callback` together with the `user_data` stored alongside it in
// the global logger state.
// Passing a NULL callback selects ggml_log_callback_default instead.
void ggml_log_set(ggml_log_callback log_callback, void * user_data) {
    g_logger_state.log_callback = log_callback ? log_callback : ggml_log_callback_default;
    g_logger_state.log_callback_user_data = user_data;
}