/*
 * Copyright (c) OpenMMLab. All rights reserved.
 * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptWeight.cc

#include "src/turbomind/models/llama/LlamaWeight.h"

#include <cstdlib>  // setenv
#include <string>

namespace turbomind {

template<typename T>
LlamaWeight<T>::LlamaWeight(size_t     head_num,
                            size_t     kv_head_num,
                            size_t     size_per_head,
                            size_t     inter_size,
                            size_t     vocab_size,
                            size_t     num_layer,
                            bool       attn_bias,
                            WeightType weight_type,
                            int        group_size,
                            int        w4_weight_layout,
                            size_t     tensor_para_size,
                            size_t     tensor_para_rank):
    hidden_units_(head_num * size_per_head),
    inter_size_(inter_size),
    vocab_size_(vocab_size),
    vocab_size_padded_(vocab_size),
    num_layer_(num_layer),
    weight_type_(weight_type),
    tensor_para_size_(tensor_para_size),
    tensor_para_rank_(tensor_para_rank)
{
    if (vocab_size_padded_ % tensor_para_size_ != 0) {
        vocab_size_padded_ = (vocab_size_padded_ + tensor_para_size_ - 1) / tensor_para_size_ * tensor_para_size_;
        TM_LOG_WARNING("pad vocab size from %zu to %zu", vocab_size_, vocab_size_padded_);
    }

    decoder_layer_weights.reserve(num_layer_);
    for (unsigned l = 0; l < num_layer_; ++l) {
        decoder_layer_weights.push_back(new LlamaDecoderLayerWeight<T>(head_num,
                                                                       kv_head_num,
                                                                       size_per_head,
                                                                       inter_size_,
                                                                       weight_type_,
                                                                       group_size,
                                                                       w4_weight_layout,
                                                                       attn_bias,
                                                                       tensor_para_size_,
                                                                       tensor_para_rank_));
    }

    // Publish the w4 weight layout through an environment variable. The last
    // argument of setenv() controls overwriting: 1 replaces an existing value,
    // 0 keeps the original value unchanged.
    const char* env_name = "LMDEPLOY_WEIGHTLAYOUT_SWITCH";
    const int   layout   = (weight_type_ == WeightType::kINT4) ? w4_weight_layout : -1;
    setenv(env_name, std::to_string(layout).c_str(), 1);

    mallocWeights();
}
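
// A minimal sketch (illustrative only, not part of this file) of how a kernel
// selection site elsewhere in the codebase might consume the variable set in
// the constructor above; the helper name below is hypothetical:
//
//     #include <cstdlib>
//     inline int readW4WeightLayout()
//     {
//         const char* v = std::getenv("LMDEPLOY_WEIGHTLAYOUT_SWITCH");
//         return v ? std::atoi(v) : -1;  // -1: not INT4, no layout switch
//     }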

template<typename T>
LlamaWeight<T>::~LlamaWeight()
{
    cudaFree((void*)pre_decoder_embedding_table);
    cudaFree((void*)output_norm_weight);
    cudaFree((void*)post_decoder_embedding_kernel);

    pre_decoder_embedding_table   = nullptr;
    output_norm_weight            = nullptr;
    post_decoder_embedding_kernel = nullptr;

    for (auto& p : decoder_layer_weights) {
        delete p;
    }
}

template<typename T>
void LlamaWeight<T>::mallocWeights()
{
    deviceMalloc((T**)&pre_decoder_embedding_table, vocab_size_padded_ * hidden_units_);
    deviceMalloc((T**)&output_norm_weight, hidden_units_);
    deviceMalloc((T**)&post_decoder_embedding_kernel, hidden_units_ * vocab_size_padded_);
}

template<typename T>
void LlamaWeight<T>::loadModel(std::string dir_path)
{
    FtCudaDataType model_file_type = FtCudaDataType::FP16;
    if (weight_type_ == WeightType::kBF16) {
        model_file_type = FtCudaDataType::BF16;
    }
    dir_path += '/';

    loadWeightFromBin((T*)pre_decoder_embedding_table,
                      {vocab_size_padded_ * hidden_units_},
                      dir_path + "tok_embeddings.weight",
                      model_file_type);
    loadWeightFromBin((T*)output_norm_weight, {hidden_units_}, dir_path + "norm.weight", model_file_type);
    loadWeightFromBin((T*)post_decoder_embedding_kernel,
                      {hidden_units_ * vocab_size_padded_},
                      dir_path + "output.weight",
                      model_file_type);

    for (unsigned layer = 0; layer < num_layer_; ++layer) {
        decoder_layer_weights[layer]->loadModel(dir_path + "layers." + std::to_string(layer), model_file_type);
    }
}

template<typename T>
TensorMap LlamaWeight<T>::getParams()
{
    TensorMap output;

    output.insert("tok_embeddings.weight",
                  Tensor{MEMORY_GPU,
                         getTensorType<T>(),
                         {vocab_size_ * hidden_units_ * sizeof(T)},
                         pre_decoder_embedding_table});

    output.insert("norm.weight",
                  Tensor{MEMORY_GPU, getTensorType<T>(), {hidden_units_ * sizeof(T)}, output_norm_weight});

    output.insert("output.weight",
                  Tensor{MEMORY_GPU,
                         getTensorType<T>(),
                         {hidden_units_ * vocab_size_ * sizeof(T)},
                         post_decoder_embedding_kernel});

    // transformer layers
    for (size_t i = 0; i < num_layer_; i++) {
        std::string prefix = fmtstr("layers.%zu", i);
        TensorMap   layeri = decoder_layer_weights[i]->getParams(prefix);
        for (auto [name, tensor] : layeri) {
            output.insert(name, tensor);
        }
    }

    return output;
}

template struct LlamaWeight<float>;
template struct LlamaWeight<half>;
#ifdef ENABLE_BF16
template struct LlamaWeight<__nv_bfloat16>;
#endif

}  // namespace turbomind
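
// A minimal usage sketch (illustrative only; the constructor arguments below
// are made-up placeholders, not values taken from this repository):
//
//     LlamaWeight<half> weights(/*head_num=*/32, /*kv_head_num=*/32,
//                               /*size_per_head=*/128, /*inter_size=*/11008,
//                               /*vocab_size=*/32000, /*num_layer=*/32,
//                               /*attn_bias=*/false, WeightType::kFP16,
//                               /*group_size=*/0, /*w4_weight_layout=*/-1,
//                               /*tensor_para_size=*/1, /*tensor_para_rank=*/0);
//     weights.loadModel("/path/to/converted/weights");  // reads the *.weight bins
//     TensorMap params = weights.getParams();           // name -> GPU tensor view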