/*
 * Copyright (c) OpenMMLab. All rights reserved.
 * Copyright (c) 2020-2023, NVIDIA CORPORATION.  All rights reserved.
 * Copyright (c) 2021, NAVER Corp.  Authored by CLOVA.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGpt.h

#pragma once

#include "src/turbomind/layers/DynamicDecodeLayer.h"

#include "src/turbomind/models/llama/Barrier.h"
#include "src/turbomind/models/llama/LlamaBatch.h"
#include "src/turbomind/models/llama/LlamaContextDecoder.h"
#include "src/turbomind/models/llama/LlamaDecoder.h"
#include "src/turbomind/models/llama/LlamaWeight.h"
#include "src/turbomind/models/llama/Request.h"
#include "src/turbomind/models/llama/SequenceManager.h"
#include "src/turbomind/models/llama/llama_params.h"

#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/instance_comm.h"
#include "src/turbomind/utils/nccl_utils.h"

#include <cstdint>
#include <functional>
#include <memory>
#include <unordered_map>
#include <vector>
// Lock-control callback installed via LlamaV2::setFfiLock(); invoked with an
// int flag. NOTE(review): presumably 1 = acquire / 0 = release around FFI
// (e.g. Python GIL) boundaries — confirm the convention against the caller.
using ffi_api_lock_ctrl_t = std::function<void(int)>;

namespace turbomind {

// Top-level LLaMA-family inference model. Owns the context (prefill) decoder,
// the incremental decoder and the dynamic-decode (sampling) layer, and exposes
// a request-based forward()/stop() interface. T is the compute dtype
// (e.g. half / float).
template<typename T>
class LlamaV2 {
public:
    // State shared between the model and its worker/batch threads.
    struct SharedState {
        std::vector<std::shared_ptr<Request>> infer_requests;  // pending inference requests
        std::vector<std::shared_ptr<Request>> stop_requests;   // pending stop requests
        RequestQueue                          request_queue;
        std::shared_ptr<Barrier>              barrier;  // synchronizes participating ranks/threads
        bool                                  abort;    // set to request shutdown
    };

    ~LlamaV2();

    // Constructs the model. Non-owning pointers (weights, cublas_wrapper,
    // allocator, cuda_device_prop) must outlive this object.
    //
    // head_num / kv_head_num      attention heads (kv_head_num < head_num => GQA/MQA)
    // size_per_head               per-head dimension
    // inter_size                  FFN intermediate size
    // num_layer                   number of transformer layers
    // vocab_size                  vocabulary size (padded internally; see vocab_size_padded_)
    // attn_params                 rotary-embedding / attention hyper-parameters
    // norm_eps                    RMSNorm epsilon
    // cache_*                     KV-cache block configuration (see SequenceManager)
    // quant_policy                KV-cache quantization policy flag
    // use_context_fmha            enable fused multi-head attention for context decoding
    // tensor_para                 NCCL tensor-parallel communicator (passed by value)
    // is_free_buffer_after_forward  free workspace buffers after each forward pass
    LlamaV2(size_t                       head_num,
            size_t                       kv_head_num,
            size_t                       size_per_head,
            size_t                       inter_size,
            size_t                       num_layer,
            size_t                       vocab_size,
            const LlamaAttentionParams&  attn_params,
            float                        norm_eps,
            int                          max_batch_size,
            int                          max_context_token_num,
            int                          session_len,
            int                          step_length,
            int                          start_id,
            int                          end_id,
            float                        cache_max_block_count,
            int                          cache_block_seq_len,
            int                          cache_chunk_size,
            int                          quant_policy,
            bool                         use_context_fmha,
            std::shared_ptr<SharedState> shared_state,
            LlamaWeight<T>*              weights,
            NcclParam                    tensor_para,
            cudaStream_t                 stream,
            cublasMMWrapper*             cublas_wrapper,
            IAllocator*                  allocator,
            bool                         is_free_buffer_after_forward,
            cudaDeviceProp*              cuda_device_prop);

    // Per-call control handle passed to forward().
    struct Control {
        AbstractInstanceComm* comm;      // instance-level communicator (may coordinate ranks)
        Request::Callback     callback;  // completion/streaming callback for the request
    };

    // Runs inference for the given named tensors. Output tensors are written
    // into `outputs`; `inputs` is read-only.
    void forward(std::unordered_map<std::string, Tensor>*       outputs,
                 const std::unordered_map<std::string, Tensor>* inputs,
                 Control                                        control);

    // Requests termination of the sequences identified by `seq_ids`.
    void stop(const std::vector<uint64_t>& seq_ids);

    // Unpadded vocabulary size this model was constructed with.
    size_t vocab_size() const noexcept
    {
        return vocab_size_;
    }

    // Installs the FFI lock callback used by the worker loop; stored in
    // ffi_lock_ and invoked around blocking FFI boundaries.
    void setFfiLock(ffi_api_lock_ctrl_t func)
    {
        ffi_lock_ = func;
    }

private:
    friend class Batch;

    // Builds decoder_, context_decoder_ and dynamic_decode_layer_ from the
    // constructor arguments (defined in the .cc file).
    void initialize(const LlamaAttentionParams& attn_params,
                    size_t                      kv_head_num,
                    bool                        use_context_fmha,
                    int                         cache_block_seq_len,
                    int                         quant_policy);

    // Gathers token embeddings for `token_ids_buf` at decode `step` into
    // `embeddings` (device memory).
    void embeddingLookup(T* embeddings, const int* token_ids_buf, int batch_size, int step);

    // Context (prefill) pass over the prompt tokens. Pointer arguments are
    // device buffers; k/v_block_ptrs address the paged KV-cache blocks and
    // cu_block_counts are the cumulative block counts per sequence.
    void contextDecode(T*           decoder_output,
                       uintptr_t*   k_block_ptrs,
                       uintptr_t*   v_block_ptrs,
                       void**       k_tmp_ptrs,
                       void**       v_tmp_ptrs,
                       T*           context_decoder_input_buf,
                       T*           context_decoder_output_buf,
                       const int*   input_ids,
                       const int*   input_length,
                       const int*   context_length,
                       const int*   cu_block_counts,
                       const float* rope_theta,
                       size_t       token_num,
                       size_t       max_input_len,
                       size_t       max_context_len,
                       size_t       session_len,
                       size_t       batch_size);

    // Single incremental decoding step (one token per active sequence).
    void decoderForward(T*           decoder_output,
                        uintptr_t*   k_cache_ptr,
                        uintptr_t*   v_cache_ptr,
                        T*           decoder_input,
                        const int*   sequence_length,
                        const bool*  finished,
                        const int*   cu_block_counts,
                        const float* rope_theta,
                        int          step,
                        int          ite,
                        int          sum_seq_len,
                        int          max_seq_len,
                        size_t       batch_size);

    // Projects decoder hidden states to vocabulary logits (float). `local_logits`
    // presumably holds the tensor-parallel partial result — confirm in the .cc.
    void postDecodeEmbedding(float* logits, float* local_logits, const T* decoder_output, int batch_size);

    // Sampling / stopping-criteria step: consumes `logits`, appends new token
    // ids and updates `finished` / `sequence_length` via dynamic_decode_layer_.
    void dynamicDecode(int*            token_ids,
                       bool*           finished,
                       int*            sequence_length,
                       bool*           should_stop,
                       curandState_t*  curand_state,
                       TensorMap*      inputs,
                       TensorMap*      outputs,
                       const float*    logits,
                       const uint32_t* seq_limit_len,
                       const int*      context_length,
                       const int*      end_ids,
                       int             step,
                       int             ite,
                       size_t          max_context_len,
                       size_t          token_ids_len,
                       size_t          batch_size);

private:
    friend class LlamaBatch<T>;

    // Model shape (fixed at construction).
    const size_t head_num_;
    const size_t size_per_head_;
    const size_t inter_size_;
    const size_t num_layer_;
    const size_t vocab_size_;
    size_t       vocab_size_padded_;  // vocab_size_ rounded up (set during initialization)
    float        rmsnorm_eps_ = 1e-6f;

    const LlamaAttentionParams attn_params_;

    // LLaMA uses the GPT-J-style (interleaved) rotary embedding, not NeoX style.
    static constexpr bool neox_rotary_style_ = false;

    const int    start_id_;  // BOS token id
    const int    end_id_;    // EOS token id
    const size_t hidden_units_;

    // Per-rank head counts under tensor parallelism.
    const size_t local_head_num_;
    const size_t local_kv_head_num_;
    NcclParam    tensor_para_;

    // Execution context (non-owning).
    cudaStream_t     stream_;
    cublasMMWrapper* cublas_wrapper_;
    IAllocator*      allocator_;
    bool             is_free_buffer_after_forward_;
    cudaDeviceProp*  cuda_device_prop_;

    const bool debug_{false};

    // Sub-modules (weights_ non-owning; others created by initialize()).
    LlamaWeight<T>*            weights_{};
    LlamaDecoder<T>*           decoder_{};
    LlamaContextDecoder<T>*    context_decoder_{};
    DynamicDecodeLayer<float>* dynamic_decode_layer_{};

    const int                      step_length_;
    std::shared_ptr<SharedState>   shared_state_;
    ffi_api_lock_ctrl_t            ffi_lock_;
    std::unique_ptr<LlamaBatch<T>> batch_;
};

}  // namespace turbomind