/*
 * Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "src/turbomind/kernels/activation_kernels.h"
#include "src/turbomind/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h"
#include "src/turbomind/kernels/cutlass_kernels/int8_gemm/int8_gemm.h"
#include "src/turbomind/kernels/matrix_vector_multiplication.h"
#include "src/turbomind/kernels/moe_kernels.h"
#include "src/turbomind/layers/BaseLayer.h"
#include "src/turbomind/layers/FfnWeight.h"
#include "src/turbomind/utils/activation_types.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/memory_utils.h"
#include <stdint.h>
#include <vector>

namespace turbomind {

template<typename T>
class FfnLayer: public BaseLayer {
private:
    // buffer handling
    size_t max_token_num_ = 0;

    // meta data
    size_t head_num_;       // (martinma): this member is not used in this class. Remove it?
    size_t size_per_head_;  // (martinma): this member is not used in this class. Remove it?
    size_t expert_num_;

    // calculated data
    size_t hidden_units_;

    // gated activation
    bool use_gated_activation_;

    std::shared_ptr<CutlassMoeFCRunner<T, T>>       moe_fc_runner_;
    std::shared_ptr<CutlassMoeFCRunner<T, uint8_t>> moe_int8_weight_only_fc_runner_;

    std::shared_ptr<CutlassFpAIntBGemmRunner<T, uint8_t>> weight_only_int8_fc_runner_;
    std::shared_ptr<CutlassInt8GemmRunner<T>>             int8_fc_runner_;

    void allocateBuffer() override;
    void freeBuffer() override;
    void allocateBuffer(int moe_k = 0, bool use_moe = false);
    void allocateBuffer(size_t token_num, int moe_k = 0, bool use_moe = false);

protected:
    T*    inter_buf_        = nullptr;
    T*    inter_buf_2_      = nullptr;  // for gated activation
    T*    moe_gates_buf_    = nullptr;
    char* moe_fc_workspace_ = nullptr;

    char*  mixed_gemm_workspace_ = nullptr;
    size_t mixed_gemm_ws_bytes_  = 0;
    char*  int8_gemm_workspace_  = nullptr;
    size_t int8_gemm_ws_bytes_   = 0;

    size_t inter_size_;
    /* used to allocater memory buffers
       different ffn layers (inter_size) will
       reuse the same ffn layer with the max inter size.
       max_inter_size will be passed as inter_size when initializing the ffn layer
    */
    size_t max_inter_size_;

    // int8_mode_ == 0 means we don't use any mechanism related to INT8.
    // int8_mode_ == 1 for weight quantized only gemm for GPT
    // int8_mode_ == 2 for SmoothQuant O3 (per tensor scales)
    int int8_mode_ = 0;

    virtual ActivationType getActivationType() const
    {
        return ActivationType::InvalidType;
    };

    void genericActivation(int          m,
                           const T*     bias1,
                           const T*     bias2,
                           const int*   ia3_tasks,
                           const T*     ia3_weights,
                           const float* activation_in,
                           const float* activation_out,
                           const int*   padding_offset,
                           const int    seq_len);

public:
    FfnLayer(size_t           max_batch_size,
             size_t           max_seq_len,
             size_t           head_num,       // (martinma): redundant parameter?
             size_t           size_per_head,  // (martinma): redundant parameter?
             size_t           expert_num,
             size_t           inter_size,
             cudaStream_t     stream,
             cublasMMWrapper* cublas_wrapper,
             IAllocator*      allocator,
             bool             is_free_buffer_after_forward,
             bool             sparse               = false,
             int              int8_mode            = 0,
             bool             use_gated_activation = false);

    FfnLayer(FfnLayer<T> const& ffn_layer);

    virtual ~FfnLayer();

    void resetInterSize(size_t runtime_inter_size)
    {
        inter_size_ = runtime_inter_size;
    }

lvhan028's avatar
lvhan028 committed
125
126
    virtual void forward(std::vector<turbomind::Tensor>*       output_tensors,
                         const std::vector<turbomind::Tensor>* input_tensors,
Li Zhang's avatar
Li Zhang committed
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
                         const FfnWeight<T>*                           ffn_weights);
    virtual void forward(TensorMap* output_tensors, TensorMap* input_tensors, const FfnWeight<T>* ffn_weights);
};

// FfnLayer specialization using the GELU activation function.
template<typename T>
class GeluFfnLayer: public FfnLayer<T> {
public:
    GeluFfnLayer(size_t           max_batch_size,
                 size_t           max_seq_len,
                 size_t           head_num,
                 size_t           size_per_head,
                 size_t           expert_num,
                 size_t           inter_size,
                 cudaStream_t     stream,
                 cublasMMWrapper* cublas_wrapper,
                 IAllocator*      allocator,
                 bool             is_free_buffer_after_forward,
                 bool             sparse               = false,
                 int              int8_mode            = 0,
                 bool             use_gated_activation = false);

    GeluFfnLayer(GeluFfnLayer<T> const& ffn_layer);

    ~GeluFfnLayer() override = default;

protected:
    using FfnLayer<T>::stream_;
    // `override` already implies virtual; redundant `virtual` removed.
    ActivationType getActivationType() const override
    {
        return ActivationType::Gelu;
    }

private:
    using FfnLayer<T>::inter_buf_;
    using FfnLayer<T>::inter_buf_2_;
    using FfnLayer<T>::inter_size_;
};

// FfnLayer specialization using the ReLU activation function.
template<typename T>
class ReluFfnLayer: public FfnLayer<T> {
public:
    ReluFfnLayer(size_t           max_batch_size,
                 size_t           max_seq_len,
                 size_t           head_num,
                 size_t           size_per_head,
                 size_t           expert_num,
                 size_t           inter_size,
                 cudaStream_t     stream,
                 cublasMMWrapper* cublas_wrapper,
                 IAllocator*      allocator,
                 bool             is_free_buffer_after_forward,
                 bool             sparse               = false,
                 int              int8_mode            = 0,
                 bool             use_gated_activation = false);

    ReluFfnLayer(ReluFfnLayer<T> const& ffn_layer);

    ~ReluFfnLayer() override = default;

protected:
    using FfnLayer<T>::stream_;
    // `override` already implies virtual; redundant `virtual` removed.
    ActivationType getActivationType() const override
    {
        return ActivationType::Relu;
    }

private:
    using FfnLayer<T>::inter_buf_;
    using FfnLayer<T>::inter_buf_2_;
    using FfnLayer<T>::inter_size_;
};

// FfnLayer specialization using the SiLU (swish) activation function.
// NOTE(review): unlike GeluFfnLayer/ReluFfnLayer, this constructor has no
// int8_mode parameter — confirm whether the SiLU path intentionally lacks
// INT8 support before unifying the signatures.
template<typename T>
class SiluFfnLayer: public FfnLayer<T> {
public:
    SiluFfnLayer(size_t           max_batch_size,
                 size_t           max_seq_len,
                 size_t           head_num,
                 size_t           size_per_head,
                 size_t           expert_num,
                 size_t           inter_size,
                 cudaStream_t     stream,
                 cublasMMWrapper* cublas_wrapper,
                 IAllocator*      allocator,
                 bool             is_free_buffer_after_forward,
                 bool             sparse               = false,
                 bool             use_gated_activation = false);

    SiluFfnLayer(SiluFfnLayer<T> const& ffn_layer);

    ~SiluFfnLayer() override = default;

protected:
    using FfnLayer<T>::stream_;
    // `override` already implies virtual; redundant `virtual` removed.
    ActivationType getActivationType() const override
    {
        return ActivationType::Silu;
    }

private:
    using FfnLayer<T>::inter_buf_;
    using FfnLayer<T>::inter_buf_2_;
    using FfnLayer<T>::inter_size_;
};

}  // namespace turbomind