llama-cparams.h

#pragma once

#include "llama.h"

#include <cstdint>

#define LLAMA_MAX_SEQ 64 // upper bound on the number of parallel sequences per context

struct llama_cparams {
    uint32_t n_ctx;           // context size used during inference
    uint32_t n_batch;         // logical maximum batch size submitted in one decode call
    uint32_t n_ubatch;        // physical maximum (micro-)batch size
    uint32_t n_seq_max;       // maximum number of sequences used in parallel
    int32_t  n_threads;       // number of threads to use for generation
    int32_t  n_threads_batch; // number of threads to use for batch processing

    float rope_freq_base;  // RoPE base frequency
    float rope_freq_scale; // RoPE frequency scaling factor

    uint32_t n_ctx_orig_yarn; // original context size used for YaRN scaling
    // These hyperparameters are not exposed in GGUF, because all
    // existing YaRN models use the same values for them.
    float yarn_ext_factor;    // YaRN extrapolation mix factor
    float yarn_attn_factor;   // YaRN magnitude scaling factor
    float yarn_beta_fast;     // YaRN low correction dim
    float yarn_beta_slow;     // YaRN high correction dim
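    // (illustrative values, assumed from llama.h's llama_context_default_params:
    //  ext_factor = -1.0f meaning "take it from the model", attn_factor = 1.0f,
    //  beta_fast = 32.0f, beta_slow = 1.0f; confirm against the current llama.h)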
    float defrag_thold;       // KV cache defragmentation threshold (<= 0 to disable)

    bool embeddings;  // extract embeddings (together with the logits)
    bool causal_attn; // use causal attention
    bool offload_kqv; // offload the KQV ops (including the KV cache) to GPU
    bool flash_attn;  // use flash attention
    bool no_perf;     // skip performance timing measurements
    bool warmup;      // the context is performing a warmup run
    bool op_offload;  // offload host tensor operations to device
    bool kv_unified;  // use a unified KV buffer across the input sequences

    enum llama_pooling_type pooling_type; // how embedding results are pooled across tokens

    ggml_backend_sched_eval_callback cb_eval; // callback invoked by the backend scheduler during graph evaluation
    void * cb_eval_user_data;                 // opaque pointer passed through to cb_eval
};
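
// ---------------------------------------------------------------------------
// Minimal usage sketch. This helper is illustrative only and is not part of
// upstream llama.cpp: the name llama_cparams_validate and the specific checks
// are assumptions, showing how the fields above relate to one another.
static inline bool llama_cparams_validate(const llama_cparams & cparams) {
    // a zero-token context cannot be used for inference
    if (cparams.n_ctx == 0) {
        return false;
    }
    // the physical micro-batch can never exceed the logical batch size
    if (cparams.n_ubatch == 0 || cparams.n_ubatch > cparams.n_batch) {
        return false;
    }
    // parallel sequences are capped at compile time by LLAMA_MAX_SEQ
    if (cparams.n_seq_max == 0 || cparams.n_seq_max > LLAMA_MAX_SEQ) {
        return false;
    }
    return true;
}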