Commit 2326380c authored by zhouxiang's avatar zhouxiang
Browse files

1、修复turbomind从hf读模型时存在bf16不支持的问题;2、支持gcc7编译

parent 9b3cb662
......@@ -305,9 +305,9 @@ class TurboMind:
data_type = 'int4'
cfg.group_size = 128
else:
output_format = update_output_format(cfg.model_name,
inferred_model_format,
model_path, output_format)
# output_format = update_output_format(cfg.model_name,
# inferred_model_format,
# model_path, output_format)
data_type = output_format
update_config_weight_type(output_format, cfg)
......
......@@ -78,7 +78,7 @@ bool BlockManager::Malloc()
return false;
}
auto ptr = (std::byte*)allocator_->malloc(block_size_ * chunk_size);
auto ptr = (uint8_t*)allocator_->malloc(block_size_ * chunk_size);
if (!ptr) {
return false;
}
......
......@@ -321,7 +321,7 @@ void LlamaBatch<T>::ProcessInferRequests(const Requests& requests)
int begin = ranges[i * 2];
int end = ranges[i * 2 + 1];
size_t count = (end - begin) * model_->hidden_units_ * sizeof(T);
seq.input_embeddings.emplace_back((std::byte*)emb_tensor_ptr, (std::byte*)(emb_tensor_ptr + count));
seq.input_embeddings.emplace_back((uint8_t*)emb_tensor_ptr, (uint8_t*)(emb_tensor_ptr + count));
seq.input_embedding_ranges.emplace_back(begin + seq.tokens.size(), end + seq.tokens.size());
emb_tensor_ptr += count;
}
......@@ -796,13 +796,13 @@ void LlamaBatch<T>::AllocatePersistantBuffer(size_t max_batch_size)
h_end_ids_buf_ = (int*)allocator_->reMalloc(h_end_ids_buf_, sizeof(int) * max_batch_size, false, true);
sampling_params_ = {
{"stop_words_list", (std::byte*)h_stop_words_, (std::byte*)d_stop_words_},
{"bad_words_list", (std::byte*)h_bad_words_, (std::byte*)d_bad_words_},
{"min_length", (std::byte*)h_min_length_, nullptr},
{"runtime_top_k", (std::byte*)h_runtime_top_k_, nullptr},
{"runtime_top_p", (std::byte*)h_runtime_top_p_, nullptr},
{"temperature", (std::byte*)h_temperature_, nullptr},
{"repetition_penalty", (std::byte*)h_repetition_penalty_, nullptr},
{"stop_words_list", (uint8_t*)h_stop_words_, (uint8_t*)d_stop_words_},
{"bad_words_list", (uint8_t*)h_bad_words_, (uint8_t*)d_bad_words_},
{"min_length", (uint8_t*)h_min_length_, nullptr},
{"runtime_top_k", (uint8_t*)h_runtime_top_k_, nullptr},
{"runtime_top_p", (uint8_t*)h_runtime_top_p_, nullptr},
{"temperature", (uint8_t*)h_temperature_, nullptr},
{"repetition_penalty", (uint8_t*)h_repetition_penalty_, nullptr},
};
for (auto& s : states_) {
......@@ -1056,7 +1056,7 @@ void LlamaBatch<T>::InitializeSampling(const GenerationState& g)
if (state_->requests[i]->inputs[rank_].isExist(name)) {
Tensor& src = state_->requests[i]->inputs[rank_].at(name);
FT_CHECK(ref.shape == src.shape);
std::copy_n(src.getPtr<std::byte>(), size_in_bytes, h_ptr + size_in_bytes * i);
std::copy_n(src.getPtr<uint8_t>(), size_in_bytes, h_ptr + size_in_bytes * i);
}
}
if (d_ptr) {
......
......@@ -284,7 +284,7 @@ private:
TensorMap inputs_;
TensorMap outputs_;
std::vector<std::tuple<std::string, std::byte*, std::byte*>> sampling_params_;
std::vector<std::tuple<std::string, uint8_t*, uint8_t*>> sampling_params_;
cudaStream_t stream_{};
cublasMMWrapper* cublas_wrapper_{};
......
......@@ -29,12 +29,12 @@ struct Sequence {
mutable int cache_len = 0;
// additional data kept round-to-round
mutable std::vector<std::byte> random_state; // update by user
mutable std::vector<uint8_t> random_state; // update by user
mutable float rope_theta = 0.f;
// embedding data
mutable std::vector<std::vector<std::byte>> input_embeddings;
mutable std::vector<std::vector<uint8_t>> input_embeddings;
mutable std::vector<std::pair<int, int>> input_embedding_ranges;
explicit Sequence(uint64_t _id): id(_id) {}
......@@ -99,7 +99,7 @@ public:
[[nodiscard]] void* GetValPtr(int block_id)
{
return (std::byte*)GetKeyPtr(block_id) + val_offset_;
return (uint8_t*)GetKeyPtr(block_id) + val_offset_;
}
int max_block_count() const noexcept
......
......@@ -115,4 +115,4 @@ endif()
add_library(tensor STATIC Tensor.cc)
#set_property(TARGET tensor PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET tensor PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(tensor PUBLIC cuda_utils logger)
target_link_libraries(tensor PUBLIC cuda_utils logger -lstdc++fs)
......@@ -22,7 +22,8 @@
#include "stdlib.h"
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <filesystem>
// #include <filesystem>
#include <experimental/filesystem>
#include <numeric>
#include <stdlib.h>
#include <string>
......@@ -31,7 +32,8 @@
#include <unordered_map>
#include <vector>
namespace fs = std::filesystem;
// namespace fs = std::filesystem;
namespace fs = std::experimental::filesystem;
namespace turbomind {
Tensor::Tensor():
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment