llama-cparams.h
#pragma once

#include "llama.h"

#include <cstdint>

#define LLAMA_MAX_SEQ 256
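// NOTE: compile-time upper bound on the number of parallel sequences a single
// context can track; n_seq_max below is expected to stay <= LLAMA_MAX_SEQ
// (an inference from its usage in llama.cpp, not stated in this header).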

struct llama_cparams {
    uint32_t n_ctx;           // context size used during inference
    uint32_t n_batch;         // logical maximum batch size that can be submitted to llama_decode
    uint32_t n_ubatch;        // physical maximum (micro-)batch size
    uint32_t n_seq_max;       // maximum number of parallel sequences

    int32_t  n_threads;       // number of threads to use for generation
    int32_t  n_threads_batch; // number of threads to use for batch processing

    float rope_freq_base;
    float rope_freq_scale;
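    // (A scale below 1.0f corresponds to linear RoPE position scaling:
    // e.g. rope_freq_scale = 0.5f stretches the model's native context 2x.)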

    uint32_t n_ctx_orig_yarn; // YaRN: original context size of the model
    // These hyperparameters are not exposed in GGUF, because all
    // existing YaRN models use the same values for them.
    float yarn_ext_factor;
    float yarn_attn_factor;
    float yarn_beta_fast;
    float yarn_beta_slow;
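    // For reference, the shared values (matching llama_context_default_params()
    // in llama.h, as of recent llama.cpp versions) are: ext_factor = -1.0f
    // (auto), attn_factor = 1.0f, beta_fast = 32.0f, beta_slow = 1.0f.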

    bool embeddings;  // if true, extract embeddings together with logits
    bool causal_attn; // whether to use causal attention
    bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
    bool flash_attn;  // whether to use flash attention
    bool no_perf;     // whether to disable performance timings
    bool warmup;      // set while a warmup run is in progress
    bool op_offload;  // whether to offload host tensor operations to device
    bool kv_unified;  // use a unified KV buffer across the input sequences

    enum llama_pooling_type pooling_type;

    ggml_backend_sched_eval_callback cb_eval; // eval callback for the ggml backend scheduler
    void * cb_eval_user_data;                 // opaque user data passed to cb_eval
};
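
// ---------------------------------------------------------------------------
// Minimal sketch (illustrative, NOT part of upstream llama.cpp): how these
// runtime params might be seeded from the public llama_context_params struct
// declared in llama.h. The helper name is hypothetical, real implementations
// apply additional validation, and fields such as causal_attn, flash_attn,
// warmup, op_offload, and kv_unified are omitted here because they are
// derived elsewhere (e.g. from the model or via setters).
// ---------------------------------------------------------------------------
static inline llama_cparams llama_cparams_from_public(const llama_context_params & p) {
    llama_cparams c = {};

    c.n_ctx            = p.n_ctx;
    c.n_batch          = p.n_batch;
    c.n_ubatch         = p.n_ubatch;
    // hypothetical clamp against the compile-time sequence limit above
    c.n_seq_max        = p.n_seq_max <= LLAMA_MAX_SEQ ? p.n_seq_max : LLAMA_MAX_SEQ;

    c.n_threads        = p.n_threads;
    c.n_threads_batch  = p.n_threads_batch;

    c.rope_freq_base   = p.rope_freq_base;
    c.rope_freq_scale  = p.rope_freq_scale;

    c.n_ctx_orig_yarn  = p.yarn_orig_ctx;
    c.yarn_ext_factor  = p.yarn_ext_factor;
    c.yarn_attn_factor = p.yarn_attn_factor;
    c.yarn_beta_fast   = p.yarn_beta_fast;
    c.yarn_beta_slow   = p.yarn_beta_slow;

    c.embeddings       = p.embeddings;
    c.offload_kqv      = p.offload_kqv;
    c.no_perf          = p.no_perf;

    c.pooling_type     = p.pooling_type;
    c.cb_eval          = p.cb_eval;
    c.cb_eval_user_data = p.cb_eval_user_data;

    return c;
}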