/*
 * Copyright (c) OpenMMLab. All rights reserved.
 * Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptWeight.cc

#include "src/turbomind/models/llama/LlamaWeight.h"

namespace turbomind {

template<typename T>
LlamaWeight<T>::LlamaWeight(size_t     head_num,
                            size_t     kv_head_num,
                            size_t     size_per_head,
                            size_t     inter_size,
                            size_t     vocab_size,
                            size_t     num_layer,
                            bool       attn_bias,
                            WeightType weight_type,
                            int        group_size,
                            int        w4_weight_layout,
                            int        w4_pad_size,
                            size_t     tensor_para_size,
                            size_t     tensor_para_rank):
    hidden_units_(head_num * size_per_head),
    inter_size_(inter_size),
    vocab_size_(vocab_size),
    vocab_size_padded_(vocab_size),
    num_layer_(num_layer),
    weight_type_(weight_type),
    tensor_para_size_(tensor_para_size),
    tensor_para_rank_(tensor_para_rank)
{
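    // Pad the vocabulary so the embedding and output matrices split evenly across tensor-parallel ranks.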
    if (vocab_size_padded_ % tensor_para_size_ != 0) {
        vocab_size_padded_ = (vocab_size_padded_ + tensor_para_size_ - 1) / tensor_para_size_ * tensor_para_size_;
        TM_LOG_WARNING("pad vocab size from %d to %d", (int)vocab_size_, (int)vocab_size_padded_);
    }
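    // One LlamaDecoderLayerWeight per transformer layer; each holds this rank's tensor-parallel shard.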
    decoder_layer_weights.reserve(num_layer_);
    for (unsigned l = 0; l < num_layer_; ++l) {
        decoder_layer_weights.push_back(new LlamaDecoderLayerWeight<T>(head_num,
                                                                       kv_head_num,
                                                                       size_per_head,
                                                                       inter_size_,
                                                                       weight_type_,
                                                                       group_size,
                                                                       w4_weight_layout,
                                                                       w4_pad_size,
                                                                       attn_bias,
                                                                       tensor_para_size_,
                                                                       tensor_para_rank_));
    }
    // setenv's third argument controls overwrite behaviour: 1 replaces any existing
    // value of the variable, 0 keeps the value that is already set.
    const char* env_name = "LMDEPLOY_WEIGHTLAYOUT_SWITCH";

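    // Export the chosen W4 weight layout through an environment variable; -1 marks weights that are not INT4-quantized.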
    if (weight_type_ == WeightType::kINT4) {
        std::string str_w4_weight_layout = std::to_string(w4_weight_layout);
        const char* env_value            = str_w4_weight_layout.c_str();
        setenv(env_name, env_value, 1);
    }
    else {
        std::string str_w4_weight_layout = std::to_string(-1);
        const char* env_value            = str_w4_weight_layout.c_str();
        setenv(env_name, env_value, 1);
81
        // printf("set LMDEPLOY_WEIGHTLAYOUT_SWITCH env: %d\n", -1);
    }
    mallocWeights();
}

template<typename T>
LlamaWeight<T>::~LlamaWeight()
{
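    // Release the global weight buffers; per-layer weights are deleted below.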
    cudaFree((void*)pre_decoder_embedding_table);
    cudaFree((void*)output_norm_weight);
    cudaFree((void*)post_decoder_embedding_kernel);

    pre_decoder_embedding_table   = nullptr;
    output_norm_weight            = nullptr;
    post_decoder_embedding_kernel = nullptr;

    for (auto& p : decoder_layer_weights) {
        delete p;
    }
}

template<typename T>
void LlamaWeight<T>::mallocWeights()
{
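    // Allocate the embedding table, final norm weight, and output projection on the device.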
    deviceMalloc((T**)&pre_decoder_embedding_table, vocab_size_padded_ * hidden_units_);
    deviceMalloc((T**)&output_norm_weight, hidden_units_);
    deviceMalloc((T**)&post_decoder_embedding_kernel, hidden_units_ * vocab_size_padded_);
}

template<typename T>
void LlamaWeight<T>::loadModel(std::string dir_path)
{
    FtCudaDataType model_file_type = FtCudaDataType::FP16;
    if (weight_type_ == WeightType::kBF16) {
        model_file_type = FtCudaDataType::BF16;
    }
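    // Global (non-layer) weights: token embedding table, final norm, and output projection.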
    dir_path += '/';

    loadWeightFromBin((T*)pre_decoder_embedding_table,
                      {vocab_size_padded_ * hidden_units_},
                      dir_path + "tok_embeddings.weight",
                      model_file_type);

    loadWeightFromBin((T*)output_norm_weight, {hidden_units_}, dir_path + "norm.weight", model_file_type);

    loadWeightFromBin((T*)post_decoder_embedding_kernel,
                      {hidden_units_ * vocab_size_padded_},
                      dir_path + "output.weight",
                      model_file_type);

    for (unsigned layer = 0; layer < num_layer_; ++layer) {
        decoder_layer_weights[layer]->loadModel(dir_path + "layers." + std::to_string(layer), model_file_type);
    }

}

template<typename T>
void LlamaWeight<T>::modifyModel()
{
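    // Apply in-place, per-layer weight adjustments after loading, using the same file data type as loadModel.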
    FtCudaDataType model_file_type = FtCudaDataType::FP16;
    if (weight_type_ == WeightType::kBF16) {
        model_file_type = FtCudaDataType::BF16;
    }

    for (unsigned layer = 0; layer < num_layer_; ++layer) {
        decoder_layer_weights[layer]->modifyModel(model_file_type);
    }
}

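// Collect every weight buffer into a TensorMap keyed by its checkpoint tensor name.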
template<typename T>
TensorMap LlamaWeight<T>::getParams()
{
    TensorMap output;

    output.insert(
        "tok_embeddings.weight",
        Tensor{MEMORY_GPU, getTensorType<T>(), {vocab_size_ * hidden_units_ * sizeof(T)}, pre_decoder_embedding_table});

    output.insert("norm.weight",
                  Tensor{MEMORY_GPU, getTensorType<T>(), {hidden_units_ * sizeof(T)}, output_norm_weight});

    output.insert(
        "output.weight",
        Tensor{
            MEMORY_GPU, getTensorType<T>(), {hidden_units_ * vocab_size_ * sizeof(T)}, post_decoder_embedding_kernel});

    // transformer layers
    for (size_t i = 0; i < num_layer_; i++) {
        std::string prefix = fmtstr("layers.%d", (int)i);
        TensorMap   layeri = decoder_layer_weights[i]->getParams(prefix);
        for (auto [name, tensor] : layeri) {
            output.insert(name, tensor);
        }
    }

    return output;
}

template struct LlamaWeight<float>;
template struct LlamaWeight<half>;
#ifdef ENABLE_BF16
template struct LlamaWeight<__nv_bfloat16>;
#endif

}  // namespace turbomind