Unverified Commit 6b00f623 authored by Chen Xin, committed by GitHub

Support loading hf model directly (#685)

* turbomind support export model params

* fix overflow

* support turbomind.from_pretrained

* fix tp

* support AutoModel

* support load kv qparams

* update auto_awq

* update docstring

* export lmdeploy version

* update doc

* remove download_hf_repo

* LmdeployForCausalLM -> LmdeployForCausalLM

* refactor turbomind.py

* update comment

* add bfloat16 convert back

* support gradio run_local load hf

* support restful api server load hf

* add docs

* support loading previous quantized model

* adapt pr 690

* update docs

* not export turbomind config when quantize a model

* check model_name when can not get it from config.json

* update readme

* remove model_name in auto_awq

* update

* update

* update

* fix build

* absolute import
parent 42e57c8b
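As a rough sketch of what this change enables: a Hugging Face repo (or a local HF-format directory) can now be handed to turbomind directly, without first converting it into a workspace. The keyword names below are assumptions based on the commit messages and the `check_tm_model_input` helper in this diff, not a verbatim API excerpt.

# Hypothetical usage sketch -- from_pretrained is the entry point added by this
# PR; argument names other than the model path are assumptions.
from lmdeploy import turbomind as tm

# model_name selects a built-in chat template when config.json does not carry
# one (see check_tm_model_input below).
tm_model = tm.TurboMind.from_pretrained('internlm/internlm-chat-7b',
                                        model_name='internlm-chat-7b')
generator = tm_model.create_instance()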
# Copyright (c) OpenMMLab. All rights reserved.
import dataclasses
import json
import logging
import os
from huggingface_hub import hf_hub_download
from transformers.utils import ExplicitEnum
logger = logging.getLogger(__name__)
class ModelSource(ExplicitEnum):
"""Turbomind model source."""
WORKSPACE = 'workspace'
HF_MODEL = 'hf_model'
HF_LMDEPLOY = 'hf_lmdeploy'
def create_hf_download_args(**kwargs) -> dict:
download_kwargs = {
'revision': None,
'cache_dir': None,
'proxies': None,
'resume_download': True,
'force_download': False,
'token': None,
'local_files_only': False
}
for k in download_kwargs.keys():
if k in kwargs:
download_kwargs[k] = kwargs[k]
return download_kwargs
def get_hf_config_path(pretrained_model_name_or_path, **kwargs) -> str:
"""Get local hf config local file path."""
if os.path.exists(pretrained_model_name_or_path):
config_path = os.path.join(pretrained_model_name_or_path,
'config.json')
else:
download_kwargs = create_hf_download_args(**kwargs)
config_path = hf_hub_download(pretrained_model_name_or_path,
'config.json', **download_kwargs)
return config_path
def get_hf_config_content(pretrained_model_name_or_path, **kwargs) -> dict:
"""Get config content of a hf model."""
config_path = get_hf_config_path(pretrained_model_name_or_path, **kwargs)
with open(config_path, 'r') as f:
config = json.load(f)
return config
def get_model_source(pretrained_model_name_or_path: str,
**kwargs) -> ModelSource:
"""Get model source."""
triton_model_path = os.path.join(pretrained_model_name_or_path,
'triton_models')
if os.path.exists(triton_model_path):
return ModelSource.WORKSPACE
config = get_hf_config_content(pretrained_model_name_or_path, **kwargs)
model_source = ModelSource.HF_LMDEPLOY if 'turbomind' in config \
else ModelSource.HF_MODEL
return model_source
def check_tm_model_input(pretrained_model_name_or_path, **kwargs):
"""Check if single input pretrained_model_name_or_path is enough to use."""
if kwargs.get('model_name', None):
return
model_source = get_model_source(pretrained_model_name_or_path, **kwargs)
if model_source == ModelSource.WORKSPACE:
return
config = get_hf_config_content(pretrained_model_name_or_path, **kwargs)
if 'turbomind' in config and config['turbomind']['model_name'] != '':
return
assert (0), '\nCan not get model name from the input model, ' \
'please supply the model name with `--model-name`. ' \
'You can list supported models by `lmdeploy list`'
@dataclasses.dataclass
class GenParam:
top_p: float
top_k: float
temperature: float
repetition_penalty: float
sequence_start: bool = False
sequence_end: bool = False
step: int = 0
request_output_len: int = 512
def get_gen_param(cap,
sampling_param,
nth_round,
step,
request_output_len=512,
**kwargs):
"""return parameters used by token generation."""
gen_param = GenParam(**dataclasses.asdict(sampling_param),
request_output_len=request_output_len)
# Fix me later. turbomind.py doesn't support None top_k
if gen_param.top_k is None:
gen_param.top_k = 40
if cap == 'chat':
gen_param.sequence_start = (nth_round == 1)
gen_param.sequence_end = False
gen_param.step = step
else:
gen_param.sequence_start = True
gen_param.sequence_end = True
gen_param.step = 0
return gen_param
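For orientation, a brief sketch of how the helpers above fit together. The `_SamplingParam` stand-in is purely illustrative; any dataclass exposing the four sampling fields satisfies the `dataclasses.asdict` call inside `get_gen_param`:

@dataclasses.dataclass
class _SamplingParam:  # illustrative stand-in, not part of lmdeploy
    top_p: float = 0.8
    top_k: float = None  # None falls back to 40 inside get_gen_param
    temperature: float = 0.7
    repetition_penalty: float = 1.0

source = get_model_source('internlm/internlm-chat-7b')  # ModelSource.HF_MODEL for a plain HF repo
gen_param = get_gen_param('chat', _SamplingParam(), nth_round=1, step=0)
# sequence_start is True on the first chat round; later rounds continue from `step`.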
...@@ -110,6 +110,47 @@ void mallocWeights(LlamaDenseWeight<T>& weights, bool bias)
}
}
template<typename FirstArg, typename... Args>
std::string concat(FirstArg&& first, Args&&... args)
{
std::stringstream stream;
stream << first;
((stream << "." << args), ...);
return stream.str();
}
template<typename T>
void getWeightTensor(LlamaDenseWeight<T>& weights, bool bias, const std::string& prefix, TensorMap& output)
{
auto get_name = [=](const std::string& name) { return concat(prefix, name); };
if (bias) {
output.insert(get_name("bias"),
Tensor{MEMORY_GPU, getTensorType<T>(), {weights.output_dims * sizeof(T)}, weights.bias});
}
const size_t bit_size = getBitSize(weights.type);
if (bit_size >= 16) {
output.insert(get_name("weight"),
Tensor{MEMORY_GPU,
getTensorType<T>(),
{weights.input_dims * weights.output_dims * sizeof(T)},
weights.kernel});
}
else { // int8, int4
const int factor = sizeof(float) * 8 / bit_size;
output.insert(get_name("qweight"),
Tensor{MEMORY_GPU,
TYPE_INT32,
{weights.input_dims * weights.output_dims * sizeof(int) / factor},
weights.kernel});
output.insert(get_name("scales_zeros"),
Tensor{MEMORY_GPU,
getTensorType<T>(),
{weights.input_dims / weights.group_size * weights.output_dims * 2 * sizeof(T)},
weights.scales_and_zeros});
}
}
template<typename T>
void loadWeights(LlamaDenseWeight<T>& w,
std::string prefix,
...@@ -226,6 +267,7 @@ void LlamaDecoderLayerWeight<T>::mallocWeights()
turbomind::mallocWeights(self_attn_weights.qkv, attn_bias_);
turbomind::mallocWeights(self_attn_weights.output, attn_bias_);
self_attn_weights.past_kv_scale = {1.f, 0.f, 1.f, 0.f};
if (weight_type_ == WeightType::kINT4) {
turbomind::mallocWeights(ffn_weights.fused_gating_intermediate, false);
...@@ -294,16 +336,43 @@ void LlamaDecoderLayerWeight<T>::loadModel(std::string dir_path, FtCudaDataType
loadWeights(ffn_weights.output, dir_path + ".feed_forward.w2", tensor_para_rank_, type, tensor_para_size_, 0);
// load kv_cache quant scale
// if file not exist, get empty vector
std::string scale_path = dir_path + ".past_kv_scale." + rank_spec + ".weight";
std::ifstream in(scale_path, std::ios::in);
if (in.is_open()) {
in.close();
self_attn_weights.past_kv_scale = loadArrayFromBin({4}, scale_path);
}
}
template<typename T>
TensorMap LlamaDecoderLayerWeight<T>::getParams(std::string prefix)
{
TensorMap output;
output.insert(concat(prefix, "attention_norm.weight"),
Tensor{MEMORY_GPU, getTensorType<T>(), {hidden_units_ * sizeof(T)}, self_attn_norm_weights});
output.insert(concat(prefix, "ffn_norm.weight"),
Tensor{MEMORY_GPU, getTensorType<T>(), {hidden_units_ * sizeof(T)}, ffn_norm_weights});
auto get_prefix = [=](std::string_view name) { return concat(prefix, name, tensor_para_rank_); };
getWeightTensor(self_attn_weights.qkv, attn_bias_, get_prefix("attention.w_qkv"), output);
getWeightTensor(self_attn_weights.output, attn_bias_, get_prefix("attention.wo"), output);
if (weight_type_ == WeightType::kINT4) {
getWeightTensor(ffn_weights.fused_gating_intermediate, false, get_prefix("feed_forward.w13"), output);
}
else {
getWeightTensor(ffn_weights.gating, false, get_prefix("feed_forward.w1"), output);
getWeightTensor(ffn_weights.intermediate, false, get_prefix("feed_forward.w3"), output);
}
getWeightTensor(ffn_weights.output, false, get_prefix("feed_forward.w2"), output);
output.insert(concat(prefix, "past_kv_scale", tensor_para_rank_, "weight"),
Tensor{MEMORY_CPU, TYPE_FP32, {4 * sizeof(float)}, self_attn_weights.past_kv_scale.data()});
return output;
}
template struct LlamaDecoderLayerWeight<float>;
......
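For reference, the dotted names produced by `concat`/`getWeightTensor` above become the keys later surfaced through `get_params`. A non-exhaustive, illustrative list for layer 0, rank 0 of a 4-bit model, inferred from the code in this hunk:

expected_keys = [
    'layers.0.attention_norm.weight',
    'layers.0.ffn_norm.weight',
    'layers.0.attention.w_qkv.0.qweight',       # quantized kernels export qweight ...
    'layers.0.attention.w_qkv.0.scales_zeros',  # ... plus per-group scales and zeros
    'layers.0.attention.wo.0.qweight',
    'layers.0.feed_forward.w13.0.qweight',      # w1/w3 fused on the INT4 path
    'layers.0.feed_forward.w2.0.qweight',
    'layers.0.past_kv_scale.0.weight',          # 4 fp32 kv-cache quant params
]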
...@@ -21,6 +21,7 @@
#pragma once
#include "src/turbomind/models/llama/LlamaDenseWeight.h"
#include "src/turbomind/utils/Tensor.h"
namespace turbomind {
...@@ -43,6 +44,8 @@ public:
void loadModel(std::string dir_path, FtCudaDataType model_file_type);
TensorMap getParams(std::string prefix);
T* self_attn_norm_weights{};
T* ffn_norm_weights{};
LlamaAttentionWeight<T> self_attn_weights{};
......
...@@ -109,6 +109,35 @@ void LlamaWeight<T>::loadModel(std::string dir_path)
}
}
template<typename T>
TensorMap LlamaWeight<T>::getParams()
{
TensorMap output;
output.insert(
"tok_embeddings.weight",
Tensor{MEMORY_GPU, getTensorType<T>(), {vocab_size_ * hidden_units_ * sizeof(T)}, pre_decoder_embedding_table});
output.insert("norm.weight",
Tensor{MEMORY_GPU, getTensorType<T>(), {hidden_units_ * sizeof(T)}, output_norm_weight});
output.insert(
"output.weight",
Tensor{
MEMORY_GPU, getTensorType<T>(), {hidden_units_ * vocab_size_ * sizeof(T)}, post_decoder_embedding_kernel});
// transformer layers
for (size_t i = 0; i < num_layer_; i++) {
std::string prefix = fmtstr("layers.%d", i);
TensorMap layeri = decoder_layer_weights[i]->getParams(prefix);
for (auto [name, tensor] : layeri) {
output.insert(name, tensor);
}
}
return output;
}
template struct LlamaWeight<float>;
template struct LlamaWeight<half>;
......
...@@ -47,6 +47,8 @@ struct LlamaWeight {
void loadModel(std::string dir_path);
TensorMap getParams();
std::vector<LlamaDecoderLayerWeight<T>*> decoder_layer_weights;
const T* pre_decoder_embedding_table{};
const T* output_norm_weight{};
......
...@@ -282,6 +282,27 @@ PYBIND11_MODULE(_turbomind, m)
return new triton::Tensor(self->where, self->type, new_shape, self->data);
},
"new_shape"_a)
.def(
"copy_from",
[](triton::Tensor* self, py::object obj) {
py::capsule cap = obj.attr("__dlpack__")();
DLManagedTensor* dlmt =
static_cast<DLManagedTensor*>(PyCapsule_GetPointer(cap.ptr(), kDlTensorCapsuleName));
auto src = DLManagedTensorToTritonTensor(dlmt);
if (self->type == triton::TYPE_FP16 || self->type == triton::TYPE_FP32
|| self->type == triton::TYPE_INT32) {
auto num_element =
std::accumulate(src->shape.begin(), src->shape.end(), 1LL, std::multiplies<int64_t>());
auto num_bytes = num_element * dlmt->dl_tensor.dtype.bits / 8;
ft::FT_CHECK(self->shape.size() == 1 && num_bytes == self->shape[0]);
cudaMemcpy(
const_cast<void*>(self->data), const_cast<void*>(src->data), num_bytes, cudaMemcpyDefault);
}
else {
ft::FT_CHECK(0);
}
},
"tensor"_a)
.def(
"__dlpack__",
[](triton::Tensor* self, long stream) {
...@@ -340,6 +361,7 @@ PYBIND11_MODULE(_turbomind, m)
.def_static(
"create_llama_model",
[](std::string model_dir,
std::string config,
size_t tensor_para_size,
size_t pipeline_para_size,
int enable_custom_all_reduce,
...@@ -354,18 +376,19 @@ PYBIND11_MODULE(_turbomind, m)
};
if (data_type == "half" || data_type == "fp16" || data_type == "int4") {
auto model = std::make_shared<LlamaTritonModel<half>>(
tensor_para_size, pipeline_para_size, enable_custom_all_reduce, model_dir, config);
model->setFfiLock(gil_control);
return model;
}
else {
auto model = std::make_shared<LlamaTritonModel<float>>(
tensor_para_size, pipeline_para_size, enable_custom_all_reduce, model_dir, config);
model->setFfiLock(gil_control);
return model;
}
},
"model_dir"_a,
"config"_a = "",
"tensor_para_size"_a = 1, "tensor_para_size"_a = 1,
"pipeline_para_size"_a = 1, "pipeline_para_size"_a = 1,
"enable_custom_all_reduce"_a = 0, "enable_custom_all_reduce"_a = 0,
...@@ -406,6 +429,15 @@ PYBIND11_MODULE(_turbomind, m) ...@@ -406,6 +429,15 @@ PYBIND11_MODULE(_turbomind, m)
py::call_guard<py::gil_scoped_release>(), py::call_guard<py::gil_scoped_release>(),
"device_id"_a, "device_id"_a,
"rank"_a) "rank"_a)
.def(
"get_params",
[](AbstractTransformerModel* model, int deviceId, int rank) {
TensorMap output = model->getParams(deviceId, rank);
return output;
},
py::call_guard<py::gil_scoped_release>(),
"device_id"_a,
"rank"_a)
.def("__str__", &AbstractTransformerModel::toString) .def("__str__", &AbstractTransformerModel::toString)
.def("__repr__", &AbstractTransformerModel::toString) .def("__repr__", &AbstractTransformerModel::toString)
.def("get_tensor_para_size", &AbstractTransformerModel::getTensorParaSize) .def("get_tensor_para_size", &AbstractTransformerModel::getTensorParaSize)
......
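Putting the new bindings together, the Python loader can push Hugging Face weights into turbomind at runtime roughly as follows. This is a minimal sketch: the `create_shared_weights` name, the import path, the keyword names of `create_llama_model`, and the pre-arranged `hf_state_dict` are assumptions not shown in this hunk; `config_ini_text` is sketched after the LlamaTritonModel.cc hunk below.

import _turbomind as _tm  # import path is environment-dependent, shown for illustration

model = _tm.AbstractTransformerModel.create_llama_model(
    model_dir='', config=config_ini_text, tensor_para_size=1, data_type='fp16')
model.create_shared_weights(0, 0)   # assumed Python name of createSharedWeights
params = model.get_params(0, 0)     # name -> turbomind tensor (device 0, rank 0)

# copy_from accepts any object exposing __dlpack__ (e.g. a torch.Tensor) whose
# dtype and byte size match the destination buffer (fp16/fp32/int32 only).
for name, dst in params.items():
    dst.copy_from(hf_state_dict[name])  # hf_state_dict: hypothetical pre-arranged dict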
...@@ -111,18 +111,35 @@ template<typename T>
LlamaTritonModel<T>::LlamaTritonModel(size_t tensor_para_size,
size_t pipeline_para_size,
int enable_custom_all_reduce,
std::string model_dir,
std::string config):
tensor_para_size_(tensor_para_size),
pipeline_para_size_(pipeline_para_size),
shared_weights_(std::vector<std::shared_ptr<ft::LlamaWeight<T>>>(ft::getDeviceCount())),
enable_custom_all_reduce_(enable_custom_all_reduce)
{
INIReader reader;
FT_CHECK_WITH_INFO((config.empty() ^ model_dir.empty()), "invalid init options");
if (!config.empty()) {
std::FILE* tmpf = std::tmpfile();
std::fputs(config.c_str(), tmpf);
std::rewind(tmpf);
reader = INIReader(tmpf);
if (reader.ParseError() < 0) {
TM_LOG_ERROR("[ERROR] Can't init with config %s", config.c_str());
ft::FT_CHECK(false);
}
}
if (!model_dir.empty()) {
model_dir_ = model_dir;
const std::string inifile{model_dir + "/config.ini"};
reader = INIReader(inifile);
if (reader.ParseError() < 0) {
TM_LOG_ERROR("[ERROR] Can't load %s", inifile.c_str());
ft::FT_CHECK(false);
}
}
model_name_ = reader.Get("llama", "model_name");
...@@ -154,7 +171,7 @@ LlamaTritonModel<T>::LlamaTritonModel(size_t tensor_para_size,
attn_params_.rope_scaling_factor = reader.GetFloat("llama", "rope_scaling_factor", 0.f);
attn_params_.max_position_embeddings = reader.GetInteger("llama", "max_position_embeddings", 0);
// attn_params_.use_dynamic_ntk = reader.GetInteger("llama", "use_dynamic_ntk", 0);
attn_params_.use_logn_attn = reader.GetInteger("llama", "use_logn_attn", 0);
handleMissingParams();
...@@ -322,10 +339,27 @@ void LlamaTritonModel<T>::createSharedWeights(int device_id, int rank)
group_size_,
tensor_para_size_,
tensor_para_rank);
// model inited with model_dir
if (model_dir_ != "") {
shared_weights_[device_id]->loadModel(model_dir_);
}
return;
}
template<typename T>
TensorMap LlamaTritonModel<T>::getParams(int deviceId, int rank)
{
ft::check_cuda_error(cudaSetDevice(deviceId));
// shared_weight should be created before getParams
ft::FT_CHECK(shared_weights_[deviceId] != nullptr);
ft::TensorMap output = shared_weights_[deviceId]->getParams();
TensorMap result;
for (auto [name, tensor] : output) {
result.emplace(name, triton::Tensor{tensor.where, tensor.type, tensor.shape, tensor.data});
}
return result;
}
template<typename T>
std::string LlamaTritonModel<T>::toString()
{
......
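Because the constructor now accepts the full INI text as the `config` argument, an exporter can assemble it in memory instead of writing config.ini to disk. A minimal sketch, showing only the `llama` keys visible in this hunk (the real config carries many more fields):

from configparser import ConfigParser
from io import StringIO

cfg = ConfigParser()
cfg['llama'] = {
    'model_name': 'internlm-chat-7b',
    'rope_scaling_factor': '0.0',
    'max_position_embeddings': '2048',
    'use_logn_attn': '0',
}
buf = StringIO()
cfg.write(buf)
config_ini_text = buf.getvalue()  # pass as `config` to create_llama_model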
...@@ -40,7 +40,8 @@ struct LlamaTritonModel: public AbstractTransformerModel {
LlamaTritonModel(size_t tensor_para_size,
size_t pipeline_para_size,
int enable_custom_all_reduce,
std::string model_dir,
std::string config = "");
~LlamaTritonModel() = default;
...@@ -53,6 +54,8 @@ struct LlamaTritonModel: public AbstractTransformerModel {
void createSharedWeights(int deviceId, int rank) override;
TensorMap getParams(int deviceId, int rank) override;
void createCustomComms(std::vector<std::shared_ptr<ft::AbstractCustomComm>>* custom_all_reduce_comms,
int world_size) override;
......
...@@ -301,6 +301,8 @@ struct AbstractTransformerModelInstance {
void* stream_ctx_ = nullptr;
};
using TensorMap = std::unordered_map<std::string, triton::Tensor>;
struct AbstractTransformerModel {
static std::shared_ptr<AbstractTransformerModel> createLlamaModel(std::string model_dir);
...@@ -324,6 +326,8 @@ struct AbstractTransformerModel {
virtual void createSharedWeights(int deviceId, int rank) = 0;
virtual TensorMap getParams(int deviceId, int rank) = 0;
virtual std::string toString() = 0;
virtual int getTensorParaSize() = 0;
virtual int getPipelineParaSize() = 0;
......