"cmd/dcu-ctk/rootless/main.go" did not exist on "0ccfece1f2af998d4ace2394c1741b0b009f6178"
LlamaContextAttentionLayer.h 5.1 KB
Newer Older
Li Zhang's avatar
Li Zhang committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
/*
 * Copyright (c) OpenMMLab. All rights reserved.
 * Copyright (c) 2021-2023, NVIDIA CORPORATION.  All rights reserved.
 * Copyright (c) 2021, NAVER Corp.  Authored by CLOVA.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/layers/attention_layers/GptContextAttentionLayer.h

#pragma once

#include "src/turbomind/models/llama/LlamaDenseWeight.h"
#include "src/turbomind/models/llama/LlamaLinear.h"
#include "src/turbomind/models/llama/llama_params.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/nccl_utils.h"

namespace turbomind {

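// Multi-head attention layer for the context (prefill) phase of Llama-family
// models. Judging from the declarations below, it projects input hidden states
// to Q/K/V, writes K/V into a block-based KV cache (blocks of
// `kv_cache_block_len_` tokens), and attends over the prompt tokens via either
// a fused MHA kernel or an unfused cuBLAS-based fallback.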
template<typename T>
class LlamaContextAttentionLayer {
public:
    void freeBuffer();
    void allocateBuffer(size_t batch_size, size_t num_token, size_t max_q_len, size_t max_kv_len);

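    // head_num / kv_head_num express grouped-query attention (GQA): head_num
    // must be a multiple of kv_head_num (see the FT_CHECK below), and both are
    // partitioned across the tensor-parallel ranks described by `tensor_para`.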
    LlamaContextAttentionLayer(size_t               head_num,
                               size_t               kv_head_num,
                               size_t               size_per_head,
                               LlamaAttentionParams attn_params,
                               NcclParam            tensor_para,
                               cudaStream_t         stream,
                               cublasMMWrapper*     cublas_wrapper,
                               IAllocator*          allocator,
                               bool                 is_free_buffer_after_forward,
                               bool                 use_fmha,
                               int                  cache_block_seq_len,
                               int                  quant_policy):
        head_num_(head_num),
        size_per_head_(size_per_head),
        hidden_units_(head_num * size_per_head),
        local_head_num_(head_num / tensor_para.world_size_),
        local_kv_head_num_(kv_head_num / tensor_para.world_size_),
        head_n_rep_(head_num / kv_head_num),
        params_(attn_params),
        tensor_para_(tensor_para),
        stream_(stream),
        cublas_wrapper_(cublas_wrapper),
        linear_(cublas_wrapper, stream),
        allocator_(allocator),
        kv_cache_block_len_(cache_block_seq_len),
        is_free_buffer_after_forward_(is_free_buffer_after_forward),
        use_fmha_(use_fmha),
        quant_policy_(quant_policy)
    {
        FT_CHECK(head_num % kv_head_num == 0);
    }

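    // Performs the full context-attention step for a batch of prompts; input
    // and output tensors are exchanged by name through the TensorMaps (the
    // exact tensor keys are defined by the implementation).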
    void forward(TensorMap* output_tensors, const TensorMap* input_tensors, const LlamaAttentionWeight<T>* weights);

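    // Fused attention path, presumably taken when `use_fmha_` is enabled.
    // `cu_seqlens` holds cumulative (prefix-summed) per-sequence lengths, the
    // usual convention of variable-length fused attention kernels.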
    void fusedMultiHeadAttention(T**    key_cache_ptrs,
                                 T**    val_cache_ptrs,
                                 size_t cache_layer_offset,
                                 T*     attention_mask,
                                 int*   cu_seqlens,
                                 int*   context_lengths,
                                 int    batch_size,
                                 int    max_q_len,
                                 int    max_k_len,
                                 int    max_seq_len);

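    // Unfused fallback: gathers K/V from the cache into contiguous buffers and
    // computes softmax(Q K^T) V with explicit GEMMs. `quant_policy` and
    // `kv_scale` presumably control dequantization of a quantized (e.g. int8)
    // KV cache.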
    void unfusedMultiHeadAttention(T**          key_cache_ptrs,
                                   T**          val_cache_ptrs,
                                   size_t       cache_layer_offset,
                                   const T*     attention_mask,
                                   const int*   padding_offset,
                                   const int*   context_length,
                                   int          batch_size,
                                   int          num_token,
                                   int          max_q_len,
                                   int          max_k_len,
                                   int          max_seq_len,
                                   int          quant_policy,
                                   const float* kv_scale);

private:
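    // Shape parameters fixed at construction. `local_*` counts are per
    // tensor-parallel rank; `head_n_rep_` is the number of query heads that
    // share one KV head under GQA.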
    const size_t head_num_;
    const size_t size_per_head_;
    const size_t hidden_units_;
    const size_t local_kv_head_num_;
    const size_t local_head_num_;
    const size_t head_n_rep_;
    const size_t kv_cache_block_len_;
    const bool   is_free_buffer_after_forward_;

    const LlamaAttentionParams params_;

    const bool use_fmha_;
    const int  quant_policy_;

    NcclParam tensor_para_;

    cudaStream_t     stream_;
    IAllocator*      allocator_;
    cublasMMWrapper* cublas_wrapper_;
    LlamaLinear<T>   linear_;

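    // Workspace buffers owned by this layer: sized in allocateBuffer() and
    // released in freeBuffer() (after every forward() when
    // `is_free_buffer_after_forward_` is set).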
    T*     qkv_buf_{};
    T*     q_buf_2_{};
    T*     k_buf_2_{};
    T*     v_buf_2_{};
    T*     k_cache_buf_{};
    T*     v_cache_buf_{};
    T*     qk_buf_{};
    float* qk_buf_float_{};
    T*     qkv_buf_2_{};
    T*     qkv_buf_3_{};

    bool is_allocate_buffer_ = false;
};

}  // namespace turbomind