Unverified Commit 6b00f623 authored by Chen Xin, committed by GitHub

Support loading hf model directly (#685)

* turbomind support export model params

* fix overflow

* support turbomind.from_pretrained

* fix tp

* support AutoModel

* support load kv qparams

* update auto_awq

* update docstring

* export lmdeploy version

* update doc

* remove download_hf_repo

* LmdeployForCausalLM -> LmdeployForCausalLM

* refactor turbomind.py

* update comment

* add bfloat16 convert back

* support gradio run_local load hf

* support restful api server load hf

* add docs

* support loading previous quantized model

* adapt pr 690

* update docs

* not export turbomind config when quantize a model

* check model_name when can not get it from config.json

* update readme

* remove model_name in auto_awq

* update

* update

* update

* fix build

* absolute import
parent 42e57c8b
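As a rough sketch of what this change enables: a Hugging Face repo (or a local HF-format directory) can now be handed to turbomind directly, without first converting it into a workspace. The keyword names below are assumptions based on the commit messages and the `check_tm_model_input` helper in this diff, not a verbatim API excerpt.

# Hypothetical usage sketch -- from_pretrained is the entry point added by this
# PR; argument names other than the model path are assumptions.
from lmdeploy import turbomind as tm

# model_name selects a built-in chat template when config.json does not carry
# one (see check_tm_model_input below).
tm_model = tm.TurboMind.from_pretrained('internlm/internlm-chat-7b',
                                        model_name='internlm-chat-7b')
generator = tm_model.create_instance()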
# Copyright (c) OpenMMLab. All rights reserved.
import dataclasses
import json
import logging
import os
from huggingface_hub import hf_hub_download
from transformers.utils import ExplicitEnum
logger = logging.getLogger(__name__)
class ModelSource(ExplicitEnum):
"""Turbomind model source."""
WORKSPACE = 'workspace'
HF_MODEL = 'hf_model'
HF_LMDEPLOY = 'hf_lmdeploy'
def create_hf_download_args(**kwargs) -> dict:
download_kwargs = {
'revision': None,
'cache_dir': None,
'proxies': None,
'resume_download': True,
'force_download': False,
'token': None,
'local_files_only': False
}
for k in download_kwargs.keys():
if k in kwargs:
download_kwargs[k] = kwargs[k]
return download_kwargs
def get_hf_config_path(pretrained_model_name_or_path, **kwargs) -> str:
"""Get local hf config local file path."""
if os.path.exists(pretrained_model_name_or_path):
config_path = os.path.join(pretrained_model_name_or_path,
'config.json')
else:
download_kwargs = create_hf_download_args(**kwargs)
config_path = hf_hub_download(pretrained_model_name_or_path,
'config.json', **download_kwargs)
return config_path
def get_hf_config_content(pretrained_model_name_or_path, **kwargs) -> dict:
"""Get config content of a hf model."""
config_path = get_hf_config_path(pretrained_model_name_or_path, **kwargs)
with open(config_path, 'r') as f:
config = json.load(f)
return config
def get_model_source(pretrained_model_name_or_path: str,
**kwargs) -> ModelSource:
"""Get model source."""
triton_model_path = os.path.join(pretrained_model_name_or_path,
'triton_models')
if os.path.exists(triton_model_path):
return ModelSource.WORKSPACE
config = get_hf_config_content(pretrained_model_name_or_path, **kwargs)
model_source = ModelSource.HF_LMDEPLOY if 'turbomind' in config \
else ModelSource.HF_MODEL
return model_source
def check_tm_model_input(pretrained_model_name_or_path, **kwargs):
"""Check if single input pretrained_model_name_or_path is enough to use."""
if kwargs.get('model_name', None):
return
model_source = get_model_source(pretrained_model_name_or_path, **kwargs)
if model_source == ModelSource.WORKSPACE:
return
config = get_hf_config_content(pretrained_model_name_or_path, **kwargs)
if 'turbomind' in config and config['turbomind']['model_name'] != '':
return
assert (0), '\nCan not get model name from the input model, ' \
'please supply the model name with `--model-name`. ' \
'You can list supported models by `lmdeploy list`'
@dataclasses.dataclass
class GenParam:
top_p: float
top_k: float
temperature: float
repetition_penalty: float
sequence_start: bool = False
sequence_end: bool = False
step: int = 0
request_output_len: int = 512
def get_gen_param(cap,
sampling_param,
nth_round,
step,
request_output_len=512,
**kwargs):
"""return parameters used by token generation."""
gen_param = GenParam(**dataclasses.asdict(sampling_param),
request_output_len=request_output_len)
# Fix me later. turbomind.py doesn't support None top_k
if gen_param.top_k is None:
gen_param.top_k = 40
if cap == 'chat':
gen_param.sequence_start = (nth_round == 1)
gen_param.sequence_end = False
gen_param.step = step
else:
gen_param.sequence_start = True
gen_param.sequence_end = True
gen_param.step = 0
return gen_param
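For orientation, a brief sketch of how the helpers above fit together. The `_SamplingParam` stand-in is purely illustrative; any dataclass exposing the four sampling fields satisfies the `dataclasses.asdict` call inside `get_gen_param`:

@dataclasses.dataclass
class _SamplingParam:  # illustrative stand-in, not part of lmdeploy
    top_p: float = 0.8
    top_k: float = None  # None falls back to 40 inside get_gen_param
    temperature: float = 0.7
    repetition_penalty: float = 1.0

source = get_model_source('internlm/internlm-chat-7b')  # ModelSource.HF_MODEL for a plain HF repo
gen_param = get_gen_param('chat', _SamplingParam(), nth_round=1, step=0)
# sequence_start is True on the first chat round; later rounds continue from `step`.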
...@@ -110,6 +110,47 @@ void mallocWeights(LlamaDenseWeight<T>& weights, bool bias)
}
}
template<typename FirstArg, typename... Args>
std::string concat(FirstArg&& first, Args&&... args)
{
std::stringstream stream;
stream << first;
((stream << "." << args), ...);
return stream.str();
}
template<typename T>
void getWeightTensor(LlamaDenseWeight<T>& weights, bool bias, const std::string& prefix, TensorMap& output)
{
auto get_name = [=](const std::string& name) { return concat(prefix, name); };
if (bias) {
output.insert(get_name("bias"),
Tensor{MEMORY_GPU, getTensorType<T>(), {weights.output_dims * sizeof(T)}, weights.bias});
}
const size_t bit_size = getBitSize(weights.type);
if (bit_size >= 16) {
output.insert(get_name("weight"),
Tensor{MEMORY_GPU,
getTensorType<T>(),
{weights.input_dims * weights.output_dims * sizeof(T)},
weights.kernel});
}
else { // int8, int4
const int factor = sizeof(float) * 8 / bit_size;
output.insert(get_name("qweight"),
Tensor{MEMORY_GPU,
TYPE_INT32,
{weights.input_dims * weights.output_dims * sizeof(int) / factor},
weights.kernel});
output.insert(get_name("scales_zeros"),
Tensor{MEMORY_GPU,
getTensorType<T>(),
{weights.input_dims / weights.group_size * weights.output_dims * 2 * sizeof(T)},
weights.scales_and_zeros});
}
}
template<typename T>
void loadWeights(LlamaDenseWeight<T>& w,
std::string prefix,
...@@ -226,6 +267,7 @@ void LlamaDecoderLayerWeight<T>::mallocWeights()
turbomind::mallocWeights(self_attn_weights.qkv, attn_bias_);
turbomind::mallocWeights(self_attn_weights.output, attn_bias_);
self_attn_weights.past_kv_scale = {1.f, 0.f, 1.f, 0.f};
if (weight_type_ == WeightType::kINT4) {
turbomind::mallocWeights(ffn_weights.fused_gating_intermediate, false);
...@@ -294,16 +336,43 @@ void LlamaDecoderLayerWeight<T>::loadModel(std::string dir_path, FtCudaDataType
loadWeights(ffn_weights.output, dir_path + ".feed_forward.w2", tensor_para_rank_, type, tensor_para_size_, 0);
// load kv_cache quant scale
// if file not exist, get empty vector
std::string scale_path = dir_path + ".past_kv_scale." + rank_spec + ".weight";
std::ifstream in(scale_path, std::ios::in);
if (in.is_open()) {
in.close();
self_attn_weights.past_kv_scale = loadArrayFromBin({4}, scale_path);
}
}
template<typename T>
TensorMap LlamaDecoderLayerWeight<T>::getParams(std::string prefix)
{
TensorMap output;
output.insert(concat(prefix, "attention_norm.weight"),
Tensor{MEMORY_GPU, getTensorType<T>(), {hidden_units_ * sizeof(T)}, self_attn_norm_weights});
output.insert(concat(prefix, "ffn_norm.weight"),
Tensor{MEMORY_GPU, getTensorType<T>(), {hidden_units_ * sizeof(T)}, ffn_norm_weights});
auto get_prefix = [=](std::string_view name) { return concat(prefix, name, tensor_para_rank_); };
getWeightTensor(self_attn_weights.qkv, attn_bias_, get_prefix("attention.w_qkv"), output);
getWeightTensor(self_attn_weights.output, attn_bias_, get_prefix("attention.wo"), output);
if (weight_type_ == WeightType::kINT4) {
getWeightTensor(ffn_weights.fused_gating_intermediate, false, get_prefix("feed_forward.w13"), output);
}
else {
getWeightTensor(ffn_weights.gating, false, get_prefix("feed_forward.w1"), output);
getWeightTensor(ffn_weights.intermediate, false, get_prefix("feed_forward.w3"), output);
}
getWeightTensor(ffn_weights.output, false, get_prefix("feed_forward.w2"), output);
output.insert(concat(prefix, "past_kv_scale", tensor_para_rank_, "weight"),
Tensor{MEMORY_CPU, TYPE_FP32, {4 * sizeof(float)}, self_attn_weights.past_kv_scale.data()});
return output;
}
template struct LlamaDecoderLayerWeight<float>;
......
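For reference, the dotted names produced by `concat`/`getWeightTensor` above become the keys later surfaced through `get_params`. A non-exhaustive, illustrative list for layer 0, rank 0 of a 4-bit model, inferred from the code in this hunk:

expected_keys = [
    'layers.0.attention_norm.weight',
    'layers.0.ffn_norm.weight',
    'layers.0.attention.w_qkv.0.qweight',       # quantized kernels export qweight ...
    'layers.0.attention.w_qkv.0.scales_zeros',  # ... plus per-group scales and zeros
    'layers.0.attention.wo.0.qweight',
    'layers.0.feed_forward.w13.0.qweight',      # w1/w3 fused on the INT4 path
    'layers.0.feed_forward.w2.0.qweight',
    'layers.0.past_kv_scale.0.weight',          # 4 fp32 kv-cache quant params
]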
...@@ -21,6 +21,7 @@
#pragma once
#include "src/turbomind/models/llama/LlamaDenseWeight.h"
#include "src/turbomind/utils/Tensor.h"
namespace turbomind {
...@@ -43,6 +44,8 @@ public:
void loadModel(std::string dir_path, FtCudaDataType model_file_type);
TensorMap getParams(std::string prefix);
T* self_attn_norm_weights{};
T* ffn_norm_weights{};
LlamaAttentionWeight<T> self_attn_weights{};
......
...@@ -109,6 +109,35 @@ void LlamaWeight<T>::loadModel(std::string dir_path)
}
}
template<typename T>
TensorMap LlamaWeight<T>::getParams()
{
TensorMap output;
output.insert(
"tok_embeddings.weight",
Tensor{MEMORY_GPU, getTensorType<T>(), {vocab_size_ * hidden_units_ * sizeof(T)}, pre_decoder_embedding_table});
output.insert("norm.weight",
Tensor{MEMORY_GPU, getTensorType<T>(), {hidden_units_ * sizeof(T)}, output_norm_weight});
output.insert(
"output.weight",
Tensor{
MEMORY_GPU, getTensorType<T>(), {hidden_units_ * vocab_size_ * sizeof(T)}, post_decoder_embedding_kernel});
// transformer layers
for (size_t i = 0; i < num_layer_; i++) {
std::string prefix = fmtstr("layers.%d", i);
TensorMap layeri = decoder_layer_weights[i]->getParams(prefix);
for (auto [name, tensor] : layeri) {
output.insert(name, tensor);
}
}
return output;
}
template struct LlamaWeight<float>;
template struct LlamaWeight<half>;
......
...@@ -47,6 +47,8 @@ struct LlamaWeight {
void loadModel(std::string dir_path);
TensorMap getParams();
std::vector<LlamaDecoderLayerWeight<T>*> decoder_layer_weights;
const T* pre_decoder_embedding_table{};
const T* output_norm_weight{};
......
...@@ -282,6 +282,27 @@ PYBIND11_MODULE(_turbomind, m)
return new triton::Tensor(self->where, self->type, new_shape, self->data);
},
"new_shape"_a)
.def(
"copy_from",
[](triton::Tensor* self, py::object obj) {
py::capsule cap = obj.attr("__dlpack__")();
DLManagedTensor* dlmt =
static_cast<DLManagedTensor*>(PyCapsule_GetPointer(cap.ptr(), kDlTensorCapsuleName));
auto src = DLManagedTensorToTritonTensor(dlmt);
if (self->type == triton::TYPE_FP16 || self->type == triton::TYPE_FP32
|| self->type == triton::TYPE_INT32) {
auto num_element =
std::accumulate(src->shape.begin(), src->shape.end(), 1LL, std::multiplies<int64_t>());
auto num_bytes = num_element * dlmt->dl_tensor.dtype.bits / 8;
ft::FT_CHECK(self->shape.size() == 1 && num_bytes == self->shape[0]);
cudaMemcpy(
const_cast<void*>(self->data), const_cast<void*>(src->data), num_bytes, cudaMemcpyDefault);
}
else {
ft::FT_CHECK(0);
}
},
"tensor"_a)
.def(
"__dlpack__",
[](triton::Tensor* self, long stream) {
...@@ -340,6 +361,7 @@ PYBIND11_MODULE(_turbomind, m)
.def_static(
"create_llama_model",
[](std::string model_dir,
std::string config,
size_t tensor_para_size,
size_t pipeline_para_size,
int enable_custom_all_reduce,
...@@ -354,18 +376,19 @@ PYBIND11_MODULE(_turbomind, m)
};
if (data_type == "half" || data_type == "fp16" || data_type == "int4") {
auto model = std::make_shared<LlamaTritonModel<half>>(
tensor_para_size, pipeline_para_size, enable_custom_all_reduce, model_dir, config);
model->setFfiLock(gil_control);
return model;
}
else {
auto model = std::make_shared<LlamaTritonModel<float>>(
tensor_para_size, pipeline_para_size, enable_custom_all_reduce, model_dir, config);
model->setFfiLock(gil_control);
return model;
}
},
"model_dir"_a,
"config"_a = "",
"tensor_para_size"_a = 1, "tensor_para_size"_a = 1,
"pipeline_para_size"_a = 1, "pipeline_para_size"_a = 1,
"enable_custom_all_reduce"_a = 0, "enable_custom_all_reduce"_a = 0,
...@@ -406,6 +429,15 @@ PYBIND11_MODULE(_turbomind, m) ...@@ -406,6 +429,15 @@ PYBIND11_MODULE(_turbomind, m)
py::call_guard<py::gil_scoped_release>(), py::call_guard<py::gil_scoped_release>(),
"device_id"_a, "device_id"_a,
"rank"_a) "rank"_a)
.def(
"get_params",
[](AbstractTransformerModel* model, int deviceId, int rank) {
TensorMap output = model->getParams(deviceId, rank);
return output;
},
py::call_guard<py::gil_scoped_release>(),
"device_id"_a,
"rank"_a)
.def("__str__", &AbstractTransformerModel::toString) .def("__str__", &AbstractTransformerModel::toString)
.def("__repr__", &AbstractTransformerModel::toString) .def("__repr__", &AbstractTransformerModel::toString)
.def("get_tensor_para_size", &AbstractTransformerModel::getTensorParaSize) .def("get_tensor_para_size", &AbstractTransformerModel::getTensorParaSize)
......
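Putting the new bindings together, the Python loader can push Hugging Face weights into turbomind at runtime roughly as follows. This is a minimal sketch: the `create_shared_weights` name, the import path, the keyword names of `create_llama_model`, and the pre-arranged `hf_state_dict` are assumptions not shown in this hunk; `config_ini_text` is sketched after the LlamaTritonModel.cc hunk below.

import _turbomind as _tm  # import path is environment-dependent, shown for illustration

model = _tm.AbstractTransformerModel.create_llama_model(
    model_dir='', config=config_ini_text, tensor_para_size=1, data_type='fp16')
model.create_shared_weights(0, 0)   # assumed Python name of createSharedWeights
params = model.get_params(0, 0)     # name -> turbomind tensor (device 0, rank 0)

# copy_from accepts any object exposing __dlpack__ (e.g. a torch.Tensor) whose
# dtype and byte size match the destination buffer (fp16/fp32/int32 only).
for name, dst in params.items():
    dst.copy_from(hf_state_dict[name])  # hf_state_dict: hypothetical pre-arranged dict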
...@@ -111,18 +111,35 @@ template<typename T>
LlamaTritonModel<T>::LlamaTritonModel(size_t tensor_para_size,
size_t pipeline_para_size,
int enable_custom_all_reduce,
std::string model_dir,
std::string config):
tensor_para_size_(tensor_para_size),
pipeline_para_size_(pipeline_para_size),
shared_weights_(std::vector<std::shared_ptr<ft::LlamaWeight<T>>>(ft::getDeviceCount())),
enable_custom_all_reduce_(enable_custom_all_reduce)
{
INIReader reader;
FT_CHECK_WITH_INFO((config.empty() ^ model_dir.empty()), "invalid init options");
if (!config.empty()) {
std::FILE* tmpf = std::tmpfile();
std::fputs(config.c_str(), tmpf);
std::rewind(tmpf);
reader = INIReader(tmpf);
if (reader.ParseError() < 0) {
TM_LOG_ERROR("[ERROR] Can't init with config %s", config.c_str());
ft::FT_CHECK(false);
}
}
if (!model_dir.empty()) {
model_dir_ = model_dir;
const std::string inifile{model_dir + "/config.ini"};
reader = INIReader(inifile);
if (reader.ParseError() < 0) {
TM_LOG_ERROR("[ERROR] Can't load %s", inifile.c_str());
ft::FT_CHECK(false);
}
}
model_name_ = reader.Get("llama", "model_name");
...@@ -154,7 +171,7 @@ LlamaTritonModel<T>::LlamaTritonModel(size_t tensor_para_size,
attn_params_.rope_scaling_factor = reader.GetFloat("llama", "rope_scaling_factor", 0.f);
attn_params_.max_position_embeddings = reader.GetInteger("llama", "max_position_embeddings", 0);
// attn_params_.use_dynamic_ntk = reader.GetInteger("llama", "use_dynamic_ntk", 0);
attn_params_.use_logn_attn = reader.GetInteger("llama", "use_logn_attn", 0);
handleMissingParams();
...@@ -322,10 +339,27 @@ void LlamaTritonModel<T>::createSharedWeights(int device_id, int rank)
group_size_,
tensor_para_size_,
tensor_para_rank);
// model inited with model_dir
if (model_dir_ != "") {
shared_weights_[device_id]->loadModel(model_dir_);
}
return;
}
template<typename T>
TensorMap LlamaTritonModel<T>::getParams(int deviceId, int rank)
{
ft::check_cuda_error(cudaSetDevice(deviceId));
// shared_weight should be created before getParams
ft::FT_CHECK(shared_weights_[deviceId] != nullptr);
ft::TensorMap output = shared_weights_[deviceId]->getParams();
TensorMap result;
for (auto [name, tensor] : output) {
result.emplace(name, triton::Tensor{tensor.where, tensor.type, tensor.shape, tensor.data});
}
return result;
}
template<typename T>
std::string LlamaTritonModel<T>::toString()
{
......
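Because the constructor now accepts the full INI text as the `config` argument, an exporter can assemble it in memory instead of writing config.ini to disk. A minimal sketch, showing only the `llama` keys visible in this hunk (the real config carries many more fields):

from configparser import ConfigParser
from io import StringIO

cfg = ConfigParser()
cfg['llama'] = {
    'model_name': 'internlm-chat-7b',
    'rope_scaling_factor': '0.0',
    'max_position_embeddings': '2048',
    'use_logn_attn': '0',
}
buf = StringIO()
cfg.write(buf)
config_ini_text = buf.getvalue()  # pass as `config` to create_llama_model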
...@@ -40,7 +40,8 @@ struct LlamaTritonModel: public AbstractTransformerModel {
LlamaTritonModel(size_t tensor_para_size,
size_t pipeline_para_size,
int enable_custom_all_reduce,
std::string model_dir,
std::string config = "");
~LlamaTritonModel() = default;
...@@ -53,6 +54,8 @@ struct LlamaTritonModel: public AbstractTransformerModel {
void createSharedWeights(int deviceId, int rank) override;
TensorMap getParams(int deviceId, int rank) override;
void createCustomComms(std::vector<std::shared_ptr<ft::AbstractCustomComm>>* custom_all_reduce_comms,
int world_size) override;
......
...@@ -301,6 +301,8 @@ struct AbstractTransformerModelInstance {
void* stream_ctx_ = nullptr;
};
using TensorMap = std::unordered_map<std::string, triton::Tensor>;
struct AbstractTransformerModel {
static std::shared_ptr<AbstractTransformerModel> createLlamaModel(std::string model_dir);
...@@ -324,6 +326,8 @@ struct AbstractTransformerModel {
virtual void createSharedWeights(int deviceId, int rank) = 0;
virtual TensorMap getParams(int deviceId, int rank) = 0;
virtual std::string toString() = 0;
virtual int getTensorParaSize() = 0;
virtual int getPipelineParaSize() = 0;
......