LlamaWeight.cc 6.48 KB
Newer Older
Li Zhang's avatar
Li Zhang committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
/*
 * Copyright (c) OpenMMLab. All rights reserved.
 * Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

Li Zhang's avatar
Li Zhang committed
18
// Modified from
lvhan028's avatar
lvhan028 committed
19
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptWeight.cc
Li Zhang's avatar
Li Zhang committed
20

lvhan028's avatar
lvhan028 committed
21
#include "src/turbomind/models/llama/LlamaWeight.h"
Li Zhang's avatar
Li Zhang committed
22

lvhan028's avatar
lvhan028 committed
23
namespace turbomind {
Li Zhang's avatar
Li Zhang committed
24
25

template<typename T>
26
27
28
LlamaWeight<T>::LlamaWeight(size_t     head_num,
                            size_t     kv_head_num,
                            size_t     size_per_head,
Li Zhang's avatar
Li Zhang committed
29
30
31
                            size_t     inter_size,
                            size_t     vocab_size,
                            size_t     num_layer,
Li Zhang's avatar
Li Zhang committed
32
                            bool       attn_bias,
33
34
                            WeightType weight_type,
                            int        group_size,
gaoqiong's avatar
gaoqiong committed
35
                            int        w4_weight_layout,
Li Zhang's avatar
Li Zhang committed
36
                            size_t     tensor_para_size,
37
                            size_t     tensor_para_rank):
38
    hidden_units_(head_num * size_per_head),
Li Zhang's avatar
Li Zhang committed
39
40
    inter_size_(inter_size),
    vocab_size_(vocab_size),
41
    vocab_size_padded_(vocab_size),
Li Zhang's avatar
Li Zhang committed
42
43
44
    num_layer_(num_layer),
    weight_type_(weight_type),
    tensor_para_size_(tensor_para_size),
45
    tensor_para_rank_(tensor_para_rank)
Li Zhang's avatar
Li Zhang committed
46
{
47
48
49
50
    if (vocab_size_padded_ % tensor_para_size_ != 0) {
        vocab_size_padded_ = (vocab_size_padded_ + tensor_para_size_ - 1) / tensor_para_size_ * tensor_para_size_;
        TM_LOG_WARNING("pad vocab size from %d to %d", vocab_size_, vocab_size_padded_);
    }
Li Zhang's avatar
Li Zhang committed
51
52
    decoder_layer_weights.reserve(num_layer_);
    for (unsigned l = 0; l < num_layer_; ++l) {
53
54
55
56
57
        decoder_layer_weights.push_back(new LlamaDecoderLayerWeight<T>(head_num,
                                                                       kv_head_num,
                                                                       size_per_head,
                                                                       inter_size_,
                                                                       weight_type_,
58
                                                                       group_size,
gaoqiong's avatar
gaoqiong committed
59
                                                                       w4_weight_layout,
60
61
62
                                                                       attn_bias,
                                                                       tensor_para_size_,
                                                                       tensor_para_rank_));
Li Zhang's avatar
Li Zhang committed
63
    }
gaoqiong's avatar
gaoqiong committed
64
65
66
    // 这同样会将环境变量 MY_VARIABLE 设置为 my_value,并且最后一个参数 1 表示如果变量已经存在,是否覆盖。如果为 1,则会覆盖原有的值;如果为 0,则不会覆盖,保持原有的值不变。
 
    char* env_name ="LMDEPLOY_WEIGHTLAYOUT_SWITCH";
Li Zhang's avatar
Li Zhang committed
67

gaoqiong's avatar
gaoqiong committed
68
69
70
71
    if(weight_type_ ==WeightType::kINT4){
        std::string str_w4_weight_layout = std::to_string(w4_weight_layout);
        const char* env_value = str_w4_weight_layout.c_str();
        setenv(env_name,env_value , 1);
72
        //printf("set LMDEPLOY_WEIGHTLAYOUT_SWITCH env: %d \n",w4_weight_layout);
gaoqiong's avatar
gaoqiong committed
73
74
75
76
77
78
    }
    else
    {
        std::string str_w4_weight_layout = std::to_string(-1);
        const char* env_value = str_w4_weight_layout.c_str();
        setenv(env_name,env_value , 1);
79
        //printf("set LMDEPLOY_WEIGHTLAYOUT_SWITCH env: %d \n",-1);
gaoqiong's avatar
gaoqiong committed
80
    }
Li Zhang's avatar
Li Zhang committed
81
82
83
84
85
86
87
88
89
90
91
92
    mallocWeights();
}

/**
 * @brief Release everything owned by this instance.
 *
 * Frees the three buffers obtained in mallocWeights() with cudaFree(), resets
 * all three pointers, and deletes every decoder layer weight object created by
 * the constructor.
 */
template<typename T>
LlamaWeight<T>::~LlamaWeight()
{
    cudaFree((void*)pre_decoder_embedding_table);
    cudaFree((void*)output_norm_weight);
    cudaFree((void*)post_decoder_embedding_kernel);

    // Reset every freed pointer, including output_norm_weight, so none is left dangling.
    pre_decoder_embedding_table   = nullptr;
    output_norm_weight            = nullptr;
    post_decoder_embedding_kernel = nullptr;

    for (auto* layer : decoder_layer_weights) {
        delete layer;
    }
    // Drop the now-dangling layer pointers.
    decoder_layer_weights.clear();
}

/**
 * @brief Allocate the model-level buffers via deviceMalloc().
 *
 * The input embedding table and the output projection each hold
 * vocab_size_padded_ * hidden_units_ elements; the final norm weight holds
 * hidden_units_ elements.
 */
template<typename T>
void LlamaWeight<T>::mallocWeights()
{
    // Both vocabulary-sized buffers share the same element count.
    const auto embedding_elems = vocab_size_padded_ * hidden_units_;

    deviceMalloc((T**)&pre_decoder_embedding_table, embedding_elems);
    deviceMalloc((T**)&output_norm_weight, hidden_units_);
    deviceMalloc((T**)&post_decoder_embedding_kernel, embedding_elems);
}

/**
 * @brief Load all weights from the binary files found under a directory.
 *
 * Reads "tok_embeddings.weight", "norm.weight" and "output.weight" from
 * `dir_path`, then asks every decoder layer to load itself from the
 * "layers.<index>" prefix inside the same directory.
 *
 * @param dir_path  Directory containing the weight files; a '/' is appended
 *                  before the file names are concatenated.
 */
template<typename T>
void LlamaWeight<T>::loadModel(std::string dir_path)
{
    // Files are read as FP16 unless the configured weight type is BF16.
    const FtCudaDataType model_file_type =
        (weight_type_ == WeightType::kBF16) ? FtCudaDataType::BF16 : FtCudaDataType::FP16;

    dir_path.push_back('/');

    // Element count shared by the embedding table and the output projection.
    const auto embedding_elems = vocab_size_padded_ * hidden_units_;

    loadWeightFromBin(
        (T*)pre_decoder_embedding_table, {embedding_elems}, dir_path + "tok_embeddings.weight", model_file_type);

    loadWeightFromBin((T*)output_norm_weight, {hidden_units_}, dir_path + "norm.weight", model_file_type);

    loadWeightFromBin(
        (T*)post_decoder_embedding_kernel, {embedding_elems}, dir_path + "output.weight", model_file_type);

    for (unsigned idx = 0; idx < num_layer_; ++idx) {
        const std::string layer_prefix = dir_path + "layers." + std::to_string(idx);
        decoder_layer_weights[idx]->loadModel(layer_prefix, model_file_type);
    }
}

133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
/**
 * @brief Collect every weight buffer of the model into a single TensorMap.
 *
 * The map contains "tok_embeddings.weight", "norm.weight" and "output.weight",
 * followed by all entries returned by each decoder layer's getParams() when
 * called with the prefix "layers.<index>".
 *
 * @return TensorMap whose tensors wrap the existing buffers.
 */
template<typename T>
TensorMap LlamaWeight<T>::getParams()
{
    TensorMap output;

    // NOTE(review): the shapes below are sized with vocab_size_, while the buffers are
    // allocated with vocab_size_padded_ elements - confirm that exposing only the
    // unpadded part is intended.
    output.insert(
        "tok_embeddings.weight",
        Tensor{MEMORY_GPU, getTensorType<T>(), {vocab_size_ * hidden_units_ * sizeof(T)}, pre_decoder_embedding_table});

    output.insert("norm.weight",
                  Tensor{MEMORY_GPU, getTensorType<T>(), {hidden_units_ * sizeof(T)}, output_norm_weight});

    output.insert(
        "output.weight",
        Tensor{
            MEMORY_GPU, getTensorType<T>(), {hidden_units_ * vocab_size_ * sizeof(T)}, post_decoder_embedding_kernel});

    // transformer layers
    for (size_t i = 0; i < num_layer_; i++) {
        // std::to_string avoids handing a size_t to a "%d" conversion and matches loadModel().
        const std::string prefix = "layers." + std::to_string(i);
        TensorMap         layeri = decoder_layer_weights[i]->getParams(prefix);
        // Bind by reference so each key/tensor pair is not copied before insertion.
        for (const auto& [name, tensor] : layeri) {
            output.insert(name, tensor);
        }
    }

    return output;
}

Li Zhang's avatar
Li Zhang committed
162
163
// Explicit instantiations of LlamaWeight for the float and half element types.
template struct LlamaWeight<float>;
template struct LlamaWeight<half>;
q.yao's avatar
q.yao committed
164
165
166
// The __nv_bfloat16 instantiation is compiled only when ENABLE_BF16 is defined.
#ifdef ENABLE_BF16
template struct LlamaWeight<__nv_bfloat16>;
#endif
Li Zhang's avatar
Li Zhang committed
167

lvhan028's avatar
lvhan028 committed
168
}  // namespace turbomind