LlamaDecoderSelfAttentionLayer.h
/*
 * Copyright (c) OpenMMLab. All rights reserved.
 * Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/layers/attention_layers/DecoderSelfAttentionLayer.h

#pragma once

#include "src/turbomind/models/llama/LlamaDenseWeight.h"
#include "src/turbomind/models/llama/LlamaLinear.h"
#include "src/turbomind/models/llama/llama_params.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/nccl_utils.h"

namespace turbomind {

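// Decoding-phase (single query token per sequence) self-attention for LLaMA-family
// models. Per-layer weights are supplied on each forward() call rather than stored here.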
template<typename T>
class LlamaDecoderSelfAttentionLayer {
public:
    void freeBuffer();
    void allocateBuffer(size_t batch_size);

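    // head_num / kv_head_num are global head counts; the per-rank "local_*" values are
    // derived from tensor_para.world_size_. cache_block_seq_len is the number of tokens
    // covered by one KV-cache block, and quant_policy selects the KV-cache quantization.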
    LlamaDecoderSelfAttentionLayer(size_t                      head_num,
                                   size_t                      kv_head_num,
                                   size_t                      size_per_head,
                                   const LlamaAttentionParams& attn_params,
                                   NcclParam                   tensor_para,
                                   cudaStream_t                stream,
                                   cublasMMWrapper*            cublas_wrapper,
                                   IAllocator*                 allocator,
                                   bool                        is_free_buffer_after_forward,
                                   int                         cache_block_seq_len,
                                   int                         quant_policy):
        head_num_(head_num),
        kv_head_num_(kv_head_num),
        size_per_head_(size_per_head),
        hidden_units_(head_num * size_per_head),
        local_head_num_(head_num / tensor_para.world_size_),
        local_kv_head_num_(kv_head_num_ / tensor_para.world_size_),
        local_hidden_units_(hidden_units_ / tensor_para.world_size_),
        params_(attn_params),
        tensor_para_(tensor_para),
        stream_(stream),
        linear_(cublas_wrapper, stream),
        allocator_(allocator),
        kv_cache_block_len_(cache_block_seq_len),
        is_free_buffer_after_forward_(is_free_buffer_after_forward),
        quant_policy_(quant_policy)
    {
        arch_ = getSMVersion();
    }

    ~LlamaDecoderSelfAttentionLayer()
    {
        freeBuffer();
    }

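    // Runs one decoding step of self-attention for the batch described by input_tensors;
    // results are written into output_tensors.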
    void forward(TensorMap* output_tensors, const TensorMap* input_tensors, const LlamaAttentionWeight<T>* weights);

private:
    const size_t head_num_;
    const size_t kv_head_num_;
    const size_t size_per_head_;
    const size_t hidden_units_;
    const size_t local_head_num_;
    const size_t local_kv_head_num_;
    const size_t local_hidden_units_;
    const size_t kv_cache_block_len_;
    const bool   is_free_buffer_after_forward_;
    const int    quant_policy_;

    const LlamaAttentionParams& params_;

    NcclParam tensor_para_;

    cudaStream_t   stream_;
    IAllocator*    allocator_;
    LlamaLinear<T> linear_;

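    // Transient workspace: fused QKV projection output and per-token attention output
    // (consumed by the output projection).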
    T* qkv_buf_     = nullptr;
    T* context_buf_ = nullptr;

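    // Split-k decoding attention: workspace_ holds per-split partial results that are
    // reduced across at most kMaxSplitK splits.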
    static constexpr int kMaxSplitK = 16;  // must be <= WARP_SIZE
    float*               workspace_ = nullptr;

    bool is_allocate_buffer_{};
    int  arch_{};
};

}  // namespace turbomind