Commit 2326380c authored by zhouxiang

1. Fix bf16 not being supported when turbomind reads a model from HF; 2. Support compilation with gcc 7

parent 9b3cb662
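
The commit makes two independent fixes. In the Python loader below, the call to update_output_format is commented out so the weight type inferred from the Hugging Face checkpoint (e.g. bf16) is kept instead of being rewritten to an unsupported format; that is my reading of the commit message, not something the diff states explicitly. For gcc 7, every use of std::byte, a C++17 library type, is replaced with uint8_t, and std::filesystem is swapped for its pre-C++17 TS spelling. A minimal, self-contained sketch of the std::byte issue, assuming a pre-C++17 language mode such as -std=c++14:

#include <cstdint>   // uint8_t has been available since C++11
// #include <cstddef> // std::byte lives here, but only under -std=c++17

int main()
{
    unsigned char raw[16] = {};

    // Portable byte pointer for raw-buffer work on older toolchains:
    uint8_t* bytes = reinterpret_cast<uint8_t*>(raw);
    bytes[0] = 0xFF;  // uint8_t allows arithmetic directly; std::byte
                      // would need std::to_integer<>() conversions

    // C++17 only -- rejected when compiling as C++14:
    // std::byte* b = reinterpret_cast<std::byte*>(raw);

    return bytes[0] == 0xFF ? 0 : 1;
}

The mechanical std::byte -> uint8_t rename below is behavior-preserving because both are byte-sized, so all pointer arithmetic and sizeof math is unchanged.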
@@ -305,9 +305,9 @@ class TurboMind:
             data_type = 'int4'
             cfg.group_size = 128
         else:
-            output_format = update_output_format(cfg.model_name,
-                                                 inferred_model_format,
-                                                 model_path, output_format)
+            # output_format = update_output_format(cfg.model_name,
+            #                                      inferred_model_format,
+            #                                      model_path, output_format)
             data_type = output_format
             update_config_weight_type(output_format, cfg)
...
@@ -78,7 +78,7 @@ bool BlockManager::Malloc()
         return false;
     }
-    auto ptr = (std::byte*)allocator_->malloc(block_size_ * chunk_size);
+    auto ptr = (uint8_t*)allocator_->malloc(block_size_ * chunk_size);
     if (!ptr) {
         return false;
     }
...
@@ -321,7 +321,7 @@ void LlamaBatch<T>::ProcessInferRequests(const Requests& requests)
             int begin = ranges[i * 2];
             int end = ranges[i * 2 + 1];
             size_t count = (end - begin) * model_->hidden_units_ * sizeof(T);
-            seq.input_embeddings.emplace_back((std::byte*)emb_tensor_ptr, (std::byte*)(emb_tensor_ptr + count));
+            seq.input_embeddings.emplace_back((uint8_t*)emb_tensor_ptr, (uint8_t*)(emb_tensor_ptr + count));
             seq.input_embedding_ranges.emplace_back(begin + seq.tokens.size(), end + seq.tokens.size());
             emb_tensor_ptr += count;
         }
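
The emplace_back above uses std::vector's iterator-pair constructor to copy count bytes of embedding data into the sequence. A small sketch of that pattern (the buffer and sizes are made up for illustration):

#include <cstddef>
#include <cstdint>
#include <vector>

int main()
{
    // Hypothetical stand-in for the tensor memory behind emb_tensor_ptr.
    float embeddings[32] = {1.0f};
    const uint8_t* src = reinterpret_cast<const uint8_t*>(embeddings);
    const std::size_t count = 8 * sizeof(float);  // one [begin, end) range, in bytes

    std::vector<std::vector<uint8_t>> input_embeddings;
    // The (first, last) constructor copies the byte range [src, src + count).
    input_embeddings.emplace_back(src, src + count);

    return input_embeddings[0].size() == count ? 0 : 1;
}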
@@ -796,13 +796,13 @@ void LlamaBatch<T>::AllocatePersistantBuffer(size_t max_batch_size)
     h_end_ids_buf_ = (int*)allocator_->reMalloc(h_end_ids_buf_, sizeof(int) * max_batch_size, false, true);
 
     sampling_params_ = {
-        {"stop_words_list", (std::byte*)h_stop_words_, (std::byte*)d_stop_words_},
-        {"bad_words_list", (std::byte*)h_bad_words_, (std::byte*)d_bad_words_},
-        {"min_length", (std::byte*)h_min_length_, nullptr},
-        {"runtime_top_k", (std::byte*)h_runtime_top_k_, nullptr},
-        {"runtime_top_p", (std::byte*)h_runtime_top_p_, nullptr},
-        {"temperature", (std::byte*)h_temperature_, nullptr},
-        {"repetition_penalty", (std::byte*)h_repetition_penalty_, nullptr},
+        {"stop_words_list", (uint8_t*)h_stop_words_, (uint8_t*)d_stop_words_},
+        {"bad_words_list", (uint8_t*)h_bad_words_, (uint8_t*)d_bad_words_},
+        {"min_length", (uint8_t*)h_min_length_, nullptr},
+        {"runtime_top_k", (uint8_t*)h_runtime_top_k_, nullptr},
+        {"runtime_top_p", (uint8_t*)h_runtime_top_p_, nullptr},
+        {"temperature", (uint8_t*)h_temperature_, nullptr},
+        {"repetition_penalty", (uint8_t*)h_repetition_penalty_, nullptr},
     };
 
     for (auto& s : states_) {
@@ -1056,7 +1056,7 @@ void LlamaBatch<T>::InitializeSampling(const GenerationState& g)
         if (state_->requests[i]->inputs[rank_].isExist(name)) {
             Tensor& src = state_->requests[i]->inputs[rank_].at(name);
             FT_CHECK(ref.shape == src.shape);
-            std::copy_n(src.getPtr<std::byte>(), size_in_bytes, h_ptr + size_in_bytes * i);
+            std::copy_n(src.getPtr<uint8_t>(), size_in_bytes, h_ptr + size_in_bytes * i);
         }
     }
     if (d_ptr) {
...
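
The two hunks above work together: AllocatePersistantBuffer registers each sampling parameter as a (name, host pointer, device pointer) tuple, and InitializeSampling packs request i's value into slot i of the batched host buffer with std::copy_n before, judging by the d_ptr branch, uploading it to the device. A simplified sketch of that strided packing, with invented sizes:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

int main()
{
    const std::size_t size_in_bytes = sizeof(int);  // one scalar per request
    const int batch_size = 4;

    std::vector<uint8_t> h_buf(size_in_bytes * batch_size);  // batched host buffer
    uint8_t* h_ptr = h_buf.data();

    for (int i = 0; i < batch_size; ++i) {
        int value = 40 + i;  // stand-in for src.getPtr<uint8_t>() contents
        // Copy request i's bytes into its slot, as in the hunk above.
        std::copy_n(reinterpret_cast<const uint8_t*>(&value), size_in_bytes,
                    h_ptr + size_in_bytes * i);
    }

    int last = 0;
    std::memcpy(&last, h_ptr + size_in_bytes * 3, size_in_bytes);
    return last == 43 ? 0 : 1;
}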
@@ -284,7 +284,7 @@ private:
     TensorMap inputs_;
     TensorMap outputs_;
-    std::vector<std::tuple<std::string, std::byte*, std::byte*>> sampling_params_;
+    std::vector<std::tuple<std::string, uint8_t*, uint8_t*>> sampling_params_;
     cudaStream_t stream_{};
     cublasMMWrapper* cublas_wrapper_{};
...
@@ -29,12 +29,12 @@ struct Sequence {
     mutable int cache_len = 0;
 
     // additional data kept round-to-round
-    mutable std::vector<std::byte> random_state;  // update by user
+    mutable std::vector<uint8_t> random_state;  // update by user
     mutable float rope_theta = 0.f;
 
     // embedding data
-    mutable std::vector<std::vector<std::byte>> input_embeddings;
+    mutable std::vector<std::vector<uint8_t>> input_embeddings;
     mutable std::vector<std::pair<int, int>> input_embedding_ranges;
 
     explicit Sequence(uint64_t _id): id(_id) {}
@@ -99,7 +99,7 @@ public:
     [[nodiscard]] void* GetValPtr(int block_id)
     {
-        return (std::byte*)GetKeyPtr(block_id) + val_offset_;
+        return (uint8_t*)GetKeyPtr(block_id) + val_offset_;
     }
 
     int max_block_count() const noexcept
...
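
GetValPtr adds a byte offset to whatever GetKeyPtr returns; pointer arithmetic on void* is not legal C++, so the pointer must first be cast to a byte-sized type, and uint8_t* now serves where std::byte* did. A minimal sketch of that K/V block layout (the struct and offsets are illustrative, not TurboMind's real classes):

#include <cstdint>
#include <cstdlib>

// One cache block: keys at the start, values val_offset bytes in.
struct Block {
    void*       base;
    std::size_t val_offset;

    void* GetKeyPtr() const { return base; }

    void* GetValPtr() const
    {
        // void* has no element size, so step through a uint8_t* instead.
        return (uint8_t*)GetKeyPtr() + val_offset;
    }
};

int main()
{
    Block b{std::malloc(256), 128};
    bool ok = (uint8_t*)b.GetValPtr() - (uint8_t*)b.GetKeyPtr() == 128;
    std::free(b.base);
    return ok ? 0 : 1;
}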
@@ -115,4 +115,4 @@ endif()
 add_library(tensor STATIC Tensor.cc)
 #set_property(TARGET tensor PROPERTY POSITION_INDEPENDENT_CODE ON)
 #set_property(TARGET tensor PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-target_link_libraries(tensor PUBLIC cuda_utils logger)
+target_link_libraries(tensor PUBLIC cuda_utils logger -lstdc++fs)
@@ -22,7 +22,8 @@
 #include "stdlib.h"
 #include <cuda_fp16.h>
 #include <cuda_runtime_api.h>
-#include <filesystem>
+// #include <filesystem>
+#include <experimental/filesystem>
 #include <numeric>
 #include <stdlib.h>
 #include <string>
@@ -31,7 +32,8 @@
 #include <unordered_map>
 #include <vector>
 
-namespace fs = std::filesystem;
+// namespace fs = std::filesystem;
+namespace fs = std::experimental::filesystem;
 
 namespace turbomind {
 
 Tensor::Tensor():
...
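
gcc only moved <filesystem> out of the std::experimental namespace in release 8, and on gcc 7/8 the TS implementation lives in a separate archive, which is what the -lstdc++fs added in the CMake hunk above links in. The commit pins the experimental spelling unconditionally; a version-agnostic alternative, sketched here with __has_include (supported since gcc 5), would keep newer compilers on the standard header:

// Hypothetical portable guard; the commit itself hardcodes the TS path.
#if __has_include(<filesystem>) && __cplusplus >= 201703L
#include <filesystem>
namespace fs = std::filesystem;
#else
#include <experimental/filesystem>  // needs -lstdc++fs on gcc 7/8
namespace fs = std::experimental::filesystem;
#endif

#include <iostream>

int main()
{
    std::cout << fs::current_path() << '\n';  // same API in both namespaces
    return 0;
}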