llama_params.h
// Copyright (c) OpenMMLab. All rights reserved.

#pragma once

namespace turbomind {

struct LlamaAttentionParams {
    int   rotary_embedding_dim;      // per-head rotary embedding (RoPE) dimension
    float rotary_embedding_base;     // RoPE frequency base (theta), e.g. 10000
    int   max_position_embeddings;   // max sequence length the model was trained with
    float rope_scaling_factor;       // RoPE scaling factor for context extension (dynamic NTK)
    // bool  use_dynamic_ntk;
    bool  use_logn_attn;             // enable log-n attention scaling (used by Qwen-style models)
};

struct EngineParams {
    // batch params
    int max_batch_size;   // max number of sequences in one forward batch
    int session_len;      // max length (prompt + generated tokens) of a session
    int step_length;

    // cache params
    float cache_max_block_count;   // k/v cache block budget; a value in (0, 1] is a fraction of free GPU memory
    int   cache_chunk_size;        // number of blocks to allocate at a time as the cache grows

    // chunking params
    int max_context_token_num;    // max context (prefill) tokens processed in one pass
    int num_tokens_per_iter;      // token budget per forward iteration
    int extra_tokens_per_iter;    // additional token budget allowed per iteration
    int max_prefill_iters;        // max number of iterations a prefill may be split across
};

}  // namespace turbomind
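
// ----------------------------------------------------------------------------
// Illustrative usage (not part of the original header): a minimal sketch of how
// these structs might be populated for a LLaMA-7B-style model. All values below
// are assumptions chosen for illustration, not defaults from the codebase.
//
//   turbomind::LlamaAttentionParams attn{};
//   attn.rotary_embedding_dim    = 128;      // = head_dim for full-dim RoPE
//   attn.rotary_embedding_base   = 10000.f;  // standard LLaMA theta
//   attn.max_position_embeddings = 2048;
//   attn.rope_scaling_factor     = 1.f;      // 1.0 -> no context extension
//   attn.use_logn_attn           = false;
//
//   turbomind::EngineParams engine{};
//   engine.max_batch_size        = 64;
//   engine.session_len           = 2048;
//   engine.cache_max_block_count = 0.5f;     // half of free GPU memory for k/v cache
//   engine.cache_chunk_size      = 1;
//   engine.max_context_token_num = 8192;
// ----------------------------------------------------------------------------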