#if defined(LLAMA_SERVER_LIBRARY)
#ifndef LLAMA_SERVER_H
#define LLAMA_SERVER_H
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

// Entry point of the embedded llama server main loop; implemented in the
// server translation unit — TODO confirm who invokes it (likely start()).
// NOTE(review): identifiers beginning with a double underscore are reserved
// for the implementation (C11 7.1.3); consider renaming in a future pass.
int __main(int argc, char **argv);

// This exposes extern C entrypoints into the llama_server
// To enable the server compile with LLAMA_SERVER_LIBRARY

#ifdef __cplusplus
extern "C" {
#endif
// Status/response record shared by all ext_server entrypoints.
// The caller owns msg: it allocates the buffer and reports its capacity in
// msg_len; on failure the callee writes the error text into msg.
typedef struct ext_server_resp {
  int id;          // < 0 on error; meaning of non-negative values is per-API
  size_t msg_len;  // caller must allocate msg and set msg_len (capacity of msg)
  char *msg;       // error-message buffer, allocated and freed by the caller
} ext_server_resp_t;

// Allocated and freed by caller
// Allocated and freed by caller
// Singly-linked list node describing one LoRA adapter to apply at init time.
typedef struct ext_server_lora_adapter {
  char *adapter;  // adapter identifier — presumably a file path; confirm with loader
  float scale;    // scaling factor applied to this adapter
  struct ext_server_lora_adapter *next;  // next node, or NULL at end of list
} ext_server_lora_adapter_t;

// Allocated and freed by caller
// Allocated and freed by caller
// Server/model configuration passed to llama_server_init().
typedef struct ext_server_params {
  char *model;            // model file path — TODO confirm expected format (GGUF?)
  uint32_t n_ctx;         // token context window, 0 = from model
  uint32_t n_batch;       // prompt processing maximum batch size
  uint32_t n_threads;     // number of threads to use for generation
  int32_t n_parallel;     // number of parallel sequences to decode
  float rope_freq_base;   // RoPE base frequency, 0 = from model
  float rope_freq_scale;  // RoPE frequency scaling factor, 0 = from model
  bool memory_f16;        // use f16 instead of f32 for memory kv
  int32_t n_gpu_layers;   // number of layers to store in VRAM (-1 - use default)
  int32_t main_gpu;       // the GPU that is used for scratch and small tensors
  bool use_mlock;         // force system to keep model in RAM
  bool use_mmap;          // use mmap if possible
  bool numa;              // attempt optimizations that help on some NUMA systems
  bool embedding;         // get only sentence embedding
  ext_server_lora_adapter_t *lora_adapters;  // optional adapter list, NULL if none
  char *mmproj;           // presumably a multimodal projector path — verify against server
  bool verbose_logging;   // Enable verbose logging of the server
} ext_server_params_t;

// One streamed result chunk for a completion task.
typedef struct ext_server_task_result {
  int id;           // task id this result belongs to
  bool stop;        // presumably true on the final chunk of the stream — confirm
  bool error;       // true if json_resp describes an error
  char *json_resp;  // null terminated, memory managed by ext_server
} ext_server_task_result_t;

// Initialize the server once per process.
// On success:  err->id == 0 and err->msg[0] == '\0'
// On failure:  err->id != 0 and err->msg contains the error message
void llama_server_init(ext_server_params_t *sparams, ext_server_resp_t *err);

// Run the main loop, called once per init
void llama_server_start(void);
// Stop the main loop and free up resources allocated in init and start.  Init
// must be called again to reuse
void llama_server_stop(void);

// json_req null terminated string, memory managed by caller
// resp->id >= 0 on success (task ID)
// resp->id < 0 on error, and resp->msg contains error message
void llama_server_completion(const char *json_req, ext_server_resp_t *resp);

// Caller must call llama_server_release_task_result to free result->json_resp
void llama_server_completion_next_result(const int task_id,
                                         ext_server_task_result_t *result);
void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err);
void llama_server_release_task_result(ext_server_task_result_t *result);

// Caller must call llama_server_release_json_resp to free *json_resp if
// err->id < 0
// NOTE(review): condition may be inverted (resp usually allocated on
// success, i.e. err->id == 0) — verify against the implementation.
void llama_server_tokenize(const char *json_req, char **json_resp,
                           ext_server_resp_t *err);
void llama_server_detokenize(const char *json_req, char **json_resp,
                             ext_server_resp_t *err);
void llama_server_embedding(const char *json_req, char **json_resp,
                            ext_server_resp_t *err);
void llama_server_release_json_resp(char **json_resp);

#ifdef __cplusplus
}
#endif

#endif
#endif  // LLAMA_SERVER_LIBRARY