// ggml-backend-impl.h
#pragma once

// ggml-backend internal header

#include "ggml-backend.h"

#ifdef  __cplusplus
extern "C" {
#endif

    //
    // Backend buffer type
    //

    struct ggml_backend_buffer_type_i {
xuxzh1's avatar
update  
xuxzh1 committed
16
        const char *          (*get_name)      (ggml_backend_buffer_type_t buft);
xuxzh1's avatar
init  
xuxzh1 committed
17
        // allocate a buffer of this type
xuxzh1's avatar
update  
xuxzh1 committed
18
        ggml_backend_buffer_t (*alloc_buffer)  (ggml_backend_buffer_type_t buft, size_t size);
xuxzh1's avatar
init  
xuxzh1 committed
19
        // tensor alignment
xuxzh1's avatar
update  
xuxzh1 committed
20
21
22
23
24
25
26
        size_t                (*get_alignment) (ggml_backend_buffer_type_t buft);
        // (optional) max buffer size that can be allocated (defaults to SIZE_MAX)
        size_t                (*get_max_size)  (ggml_backend_buffer_type_t buft);
        // (optional) data size needed to allocate the tensor, including padding (defaults to ggml_nbytes)
        size_t                (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
        // (optional) check if tensor data is in host memory and uses standard ggml tensor layout (defaults to false)
        bool                  (*is_host)       (ggml_backend_buffer_type_t buft);
xuxzh1's avatar
init  
xuxzh1 committed
27
28
29
30
    };

    struct ggml_backend_buffer_type {
        struct ggml_backend_buffer_type_i  iface;
xuxzh1's avatar
update  
xuxzh1 committed
31
32
        ggml_backend_dev_t device;
        void * context;
xuxzh1's avatar
init  
xuxzh1 committed
33
34
    };

    //
    // Backend buffer
    //

    struct ggml_backend_buffer_i {
xuxzh1's avatar
update  
xuxzh1 committed
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
        // (optional) free the buffer
        void         (*free_buffer)  (ggml_backend_buffer_t buffer);
        // base address of the buffer
        void *       (*get_base)     (ggml_backend_buffer_t buffer);
        // (optional) initialize a tensor in the buffer (eg. add tensor extras)
        void         (*init_tensor)  (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
        // tensor data access
        void         (*memset_tensor)(ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor,     uint8_t value, size_t offset, size_t size);
        void         (*set_tensor)   (ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
        void         (*get_tensor)   (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
        // (optional) tensor copy: dst is in the buffer, src may be in any buffer, including buffers from a different backend (return false if not supported)
        bool         (*cpy_tensor)   (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst);
        // clear the entire buffer
        void         (*clear)        (ggml_backend_buffer_t buffer, uint8_t value);
        // (optional) reset any internal state due to tensor initialization, such as tensor extras
        void         (*reset)        (ggml_backend_buffer_t buffer);
xuxzh1's avatar
init  
xuxzh1 committed
56
57
58
59
60
    };

    struct ggml_backend_buffer {
        struct ggml_backend_buffer_i  iface;
        ggml_backend_buffer_type_t    buft;
xuxzh1's avatar
update  
xuxzh1 committed
61
        void * context;
xuxzh1's avatar
init  
xuxzh1 committed
62
63
64
65
        size_t size;
        enum ggml_backend_buffer_usage usage;
    };

xuxzh1's avatar
update  
xuxzh1 committed
66
67
68
69
70
    // Build a buffer handle from its parts. The parameters mirror the buft, iface,
    // context and size members of struct ggml_backend_buffer.
    // NOTE(review): allocation/ownership of the returned handle is defined by the
    // implementation, which is not part of this header — confirm before relying on it.
    ggml_backend_buffer_t ggml_backend_buffer_init(
                   ggml_backend_buffer_type_t buft,
            struct ggml_backend_buffer_i      iface,
                   void *                     context,
                   size_t                     size);

    // do not use directly, use ggml_backend_tensor_copy instead
    // returns a bool (NOTE(review): presumably false when the copy is not supported,
    // matching the cpy_tensor interface entry — confirm in the implementation)
    bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);

    // multi-buffer
    // buffer that contains a collection of buffers
    //   alloc_buffer:    takes an array of n_buffers buffer handles and returns a single buffer handle
    //   is_multi_buffer: reports whether the given buffer is a multi-buffer
    //   set_usage:       takes a buffer and a usage value (NOTE(review): presumably applied to the contained buffers — confirm)
    ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
    bool                  ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
    void                  ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);

    //
    // Backend (stream)
    //

    struct ggml_backend_i {
xuxzh1's avatar
update  
xuxzh1 committed
86
        const char * (*get_name)(ggml_backend_t backend);
xuxzh1's avatar
init  
xuxzh1 committed
87

xuxzh1's avatar
update  
xuxzh1 committed
88
        void (*free)(ggml_backend_t backend);
xuxzh1's avatar
init  
xuxzh1 committed
89
90

        // (optional) asynchronous tensor data access
xuxzh1's avatar
update  
xuxzh1 committed
91
92
93
        void (*set_tensor_async)(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
        void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
        bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
xuxzh1's avatar
init  
xuxzh1 committed
94

xuxzh1's avatar
update  
xuxzh1 committed
95
96
        // (optional) complete all pending operations (required if the backend supports async operations)
        void (*synchronize)(ggml_backend_t backend);
xuxzh1's avatar
init  
xuxzh1 committed
97

xuxzh1's avatar
update  
xuxzh1 committed
98
99
100
101
        // (optional) graph plans (not used currently)
        // compute graph with a plan
        ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
        void                      (*graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
xuxzh1's avatar
init  
xuxzh1 committed
102
        // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
xuxzh1's avatar
update  
xuxzh1 committed
103
        void                      (*graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
xuxzh1's avatar
init  
xuxzh1 committed
104
        // compute the graph with the plan
xuxzh1's avatar
update  
xuxzh1 committed
105
        enum ggml_status          (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
xuxzh1's avatar
init  
xuxzh1 committed
106

xuxzh1's avatar
update  
xuxzh1 committed
107
108
        // compute graph (always async if supported by the backend)
        enum ggml_status          (*graph_compute)     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
xuxzh1's avatar
init  
xuxzh1 committed
109
110

        // (optional) event synchronization
xuxzh1's avatar
update  
xuxzh1 committed
111
112
113
114
        // record an event on this stream
        void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);
        // wait for an event on on a different stream
        void (*event_wait)  (ggml_backend_t backend, ggml_backend_event_t event);
xuxzh1's avatar
init  
xuxzh1 committed
115
116
117
118
119
    };

    struct ggml_backend {
        ggml_guid_t guid;
        struct ggml_backend_i iface;
xuxzh1's avatar
update  
xuxzh1 committed
120
121
        ggml_backend_dev_t device;
        void * context;
xuxzh1's avatar
init  
xuxzh1 committed
122
123
124
    };

    // A backend event: a pointer to a device plus an untyped context pointer.
    struct ggml_backend_event {
        struct ggml_backend_device * device;
        void * context;
    };

    //
    // Backend device
    //

    // Note: if additional properties are needed, we should add a struct with all of them
    //       the current functions to obtain the properties can remain, since they are more convenient for often used properties
    // Interface (table of function pointers) of a backend device.
    // Entries marked (optional) are documented as such on each entry.
    struct ggml_backend_device_i {
        // device name: short identifier for this device, such as "CPU" or "CUDA0"
        const char * (*get_name)(ggml_backend_dev_t dev);

        // device description: short informative description of the device, could be the model name
        const char * (*get_description)(ggml_backend_dev_t dev);

        // device memory in bytes (written through the free and total output pointers)
        void         (*get_memory)(ggml_backend_dev_t dev, size_t * free, size_t * total);

        // device type
        enum ggml_backend_dev_type (*get_type)(ggml_backend_dev_t dev);

        // device properties (written through the props output pointer)
        void (*get_props)(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props);

        // backend (stream) initialization
        // NOTE(review): the format/meaning of params is not defined in this header — confirm per backend
        ggml_backend_t (*init_backend)(ggml_backend_dev_t dev, const char * params);

        // preferred buffer type
        ggml_backend_buffer_type_t (*get_buffer_type)(ggml_backend_dev_t dev);

        // (optional) host buffer type (in system memory, typically this is a pinned memory buffer for faster transfers between host and device)
        ggml_backend_buffer_type_t (*get_host_buffer_type)(ggml_backend_dev_t dev);

        // (optional) buffer from pointer: create a buffer from a host pointer (useful for memory mapped models and importing data from other libraries)
        ggml_backend_buffer_t (*buffer_from_host_ptr)(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size);

        // check if the backend can compute an operation
        bool (*supports_op)(ggml_backend_dev_t dev, const struct ggml_tensor * op);

        // check if the backend can use tensors allocated in a buffer type
        bool (*supports_buft)(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft);

        // (optional) check if the backend wants to run an operation, even if the weights are allocated in an incompatible buffer
        // these should be expensive operations that may benefit from running on this backend instead of the CPU backend
        bool (*offload_op)(ggml_backend_dev_t dev, const struct ggml_tensor * op);

        // (optional) event synchronization: create, destroy and synchronize an event for this device
        ggml_backend_event_t (*event_new)         (ggml_backend_dev_t dev);
        void                 (*event_free)        (ggml_backend_dev_t dev, ggml_backend_event_t event);
        void                 (*event_synchronize) (ggml_backend_dev_t dev, ggml_backend_event_t event);
    };

    // A backend device: its interface table, the backend registration (reg) handle
    // stored alongside it, and an untyped context pointer.
    struct ggml_backend_device {
        struct ggml_backend_device_i iface;
        ggml_backend_reg_t reg;
        void * context;
    };

    //
    // Backend (reg)
    //

    // Interface (table of function pointers) of a backend registration (reg).
    struct ggml_backend_reg_i {
        // name of this backend registration
        const char * (*get_name)(ggml_backend_reg_t reg);

        // enumerate available devices
        // NOTE(review): index is presumably in [0, get_device_count(reg)) — confirm against callers
        size_t             (*get_device_count)(ggml_backend_reg_t reg);
        ggml_backend_dev_t (*get_device)(ggml_backend_reg_t reg, size_t index);

        // (optional) get a pointer to a function in the backend
        // backends can add custom functions that are not part of the standard ggml-backend interface
        void * (*get_proc_address)(ggml_backend_reg_t reg, const char * name);
    };

    // A backend registration: its interface table plus an untyped context pointer.
    struct ggml_backend_reg {
        // int api_version; // TODO: for dynamic loading
        struct ggml_backend_reg_i iface;
        void * context;
    };

    // Internal backend registry API
    // register a backend registration (reg) and a device, respectively
    void ggml_backend_register(ggml_backend_reg_t reg);
    void ggml_backend_device_register(ggml_backend_dev_t device);
    // TODO: backends can be loaded as a dynamic library, in which case it needs to export this function
    // typedef ggml_backend_register_t * (*ggml_backend_init)(void);

#ifdef  __cplusplus
}
#endif