Commit 2326380c authored by zhouxiang's avatar zhouxiang
Browse files

1、修复turbomind从hf读模型时存在bf16不支持的问题;2、支持gcc7编译

parent 9b3cb662
......@@ -305,9 +305,9 @@ class TurboMind:
data_type = 'int4'
cfg.group_size = 128
else:
output_format = update_output_format(cfg.model_name,
inferred_model_format,
model_path, output_format)
# output_format = update_output_format(cfg.model_name,
# inferred_model_format,
# model_path, output_format)
data_type = output_format
update_config_weight_type(output_format, cfg)
......
......@@ -78,7 +78,7 @@ bool BlockManager::Malloc()
return false;
}
auto ptr = (std::byte*)allocator_->malloc(block_size_ * chunk_size);
auto ptr = (uint8_t*)allocator_->malloc(block_size_ * chunk_size);
if (!ptr) {
return false;
}
......
......@@ -321,7 +321,7 @@ void LlamaBatch<T>::ProcessInferRequests(const Requests& requests)
int begin = ranges[i * 2];
int end = ranges[i * 2 + 1];
size_t count = (end - begin) * model_->hidden_units_ * sizeof(T);
seq.input_embeddings.emplace_back((std::byte*)emb_tensor_ptr, (std::byte*)(emb_tensor_ptr + count));
seq.input_embeddings.emplace_back((uint8_t*)emb_tensor_ptr, (uint8_t*)(emb_tensor_ptr + count));
seq.input_embedding_ranges.emplace_back(begin + seq.tokens.size(), end + seq.tokens.size());
emb_tensor_ptr += count;
}
......@@ -796,13 +796,13 @@ void LlamaBatch<T>::AllocatePersistantBuffer(size_t max_batch_size)
h_end_ids_buf_ = (int*)allocator_->reMalloc(h_end_ids_buf_, sizeof(int) * max_batch_size, false, true);
sampling_params_ = {
{"stop_words_list", (std::byte*)h_stop_words_, (std::byte*)d_stop_words_},
{"bad_words_list", (std::byte*)h_bad_words_, (std::byte*)d_bad_words_},
{"min_length", (std::byte*)h_min_length_, nullptr},
{"runtime_top_k", (std::byte*)h_runtime_top_k_, nullptr},
{"runtime_top_p", (std::byte*)h_runtime_top_p_, nullptr},
{"temperature", (std::byte*)h_temperature_, nullptr},
{"repetition_penalty", (std::byte*)h_repetition_penalty_, nullptr},
{"stop_words_list", (uint8_t*)h_stop_words_, (uint8_t*)d_stop_words_},
{"bad_words_list", (uint8_t*)h_bad_words_, (uint8_t*)d_bad_words_},
{"min_length", (uint8_t*)h_min_length_, nullptr},
{"runtime_top_k", (uint8_t*)h_runtime_top_k_, nullptr},
{"runtime_top_p", (uint8_t*)h_runtime_top_p_, nullptr},
{"temperature", (uint8_t*)h_temperature_, nullptr},
{"repetition_penalty", (uint8_t*)h_repetition_penalty_, nullptr},
};
for (auto& s : states_) {
......@@ -1056,7 +1056,7 @@ void LlamaBatch<T>::InitializeSampling(const GenerationState& g)
if (state_->requests[i]->inputs[rank_].isExist(name)) {
Tensor& src = state_->requests[i]->inputs[rank_].at(name);
FT_CHECK(ref.shape == src.shape);
std::copy_n(src.getPtr<std::byte>(), size_in_bytes, h_ptr + size_in_bytes * i);
std::copy_n(src.getPtr<uint8_t>(), size_in_bytes, h_ptr + size_in_bytes * i);
}
}
if (d_ptr) {
......
......@@ -284,7 +284,7 @@ private:
TensorMap inputs_;
TensorMap outputs_;
std::vector<std::tuple<std::string, std::byte*, std::byte*>> sampling_params_;
std::vector<std::tuple<std::string, uint8_t*, uint8_t*>> sampling_params_;
cudaStream_t stream_{};
cublasMMWrapper* cublas_wrapper_{};
......
......@@ -29,12 +29,12 @@ struct Sequence {
mutable int cache_len = 0;
// additional data kept round-to-round
mutable std::vector<std::byte> random_state; // update by user
mutable std::vector<uint8_t> random_state; // update by user
mutable float rope_theta = 0.f;
// embedding data
mutable std::vector<std::vector<std::byte>> input_embeddings;
mutable std::vector<std::vector<uint8_t>> input_embeddings;
mutable std::vector<std::pair<int, int>> input_embedding_ranges;
explicit Sequence(uint64_t _id): id(_id) {}
......@@ -99,7 +99,7 @@ public:
[[nodiscard]] void* GetValPtr(int block_id)
{
return (std::byte*)GetKeyPtr(block_id) + val_offset_;
return (uint8_t*)GetKeyPtr(block_id) + val_offset_;
}
int max_block_count() const noexcept
......
......@@ -115,4 +115,4 @@ endif()
add_library(tensor STATIC Tensor.cc)
#set_property(TARGET tensor PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET tensor PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(tensor PUBLIC cuda_utils logger)
target_link_libraries(tensor PUBLIC cuda_utils logger -lstdc++fs)
......@@ -22,7 +22,8 @@
#include "stdlib.h"
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <filesystem>
// #include <filesystem>
#include <experimental/filesystem>
#include <numeric>
#include <stdlib.h>
#include <string>
......@@ -31,7 +32,8 @@
#include <unordered_map>
#include <vector>
namespace fs = std::filesystem;
// namespace fs = std::filesystem;
namespace fs = std::experimental::filesystem;
namespace turbomind {
Tensor::Tensor():
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment