Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Lmdeploy
Commits
2326380c
Commit
2326380c
authored
May 17, 2024
by
zhouxiang
Browse files
1、修复turbomind从hf读模型时存在bf16不支持的问题;2、支持gcc7编译 (1. Fix bf16 not being supported when turbomind reads a model from HF; 2. Support building with GCC 7)
parent
9b3cb662
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
22 additions
and
20 deletions
+22
-20
lmdeploy/turbomind/turbomind.py
lmdeploy/turbomind/turbomind.py
+3
-3
src/turbomind/models/llama/BlockManager.cc
src/turbomind/models/llama/BlockManager.cc
+1
-1
src/turbomind/models/llama/LlamaBatch.cc
src/turbomind/models/llama/LlamaBatch.cc
+9
-9
src/turbomind/models/llama/LlamaBatch.h
src/turbomind/models/llama/LlamaBatch.h
+1
-1
src/turbomind/models/llama/SequenceManager.h
src/turbomind/models/llama/SequenceManager.h
+3
-3
src/turbomind/utils/CMakeLists.txt
src/turbomind/utils/CMakeLists.txt
+1
-1
src/turbomind/utils/Tensor.cc
src/turbomind/utils/Tensor.cc
+4
-2
No files found.
lmdeploy/turbomind/turbomind.py
View file @
2326380c
...
...
@@ -305,9 +305,9 @@ class TurboMind:
             data_type = 'int4'
             cfg.group_size = 128
         else:
-            output_format = update_output_format(cfg.model_name,
-                                                 inferred_model_format,
-                                                 model_path, output_format)
+            # output_format = update_output_format(cfg.model_name,
+            #                                      inferred_model_format,
+            #                                      model_path, output_format)
             data_type = output_format
             update_config_weight_type(output_format, cfg)
...
...
src/turbomind/models/llama/BlockManager.cc
View file @
2326380c
...
...
@@ -78,7 +78,7 @@ bool BlockManager::Malloc()
         return false;
     }
-    auto ptr = (std::byte*)allocator_->malloc(block_size_ * chunk_size);
+    auto ptr = (uint8_t*)allocator_->malloc(block_size_ * chunk_size);
     if (!ptr) {
         return false;
     }
...
...
src/turbomind/models/llama/LlamaBatch.cc
View file @
2326380c
...
...
@@ -321,7 +321,7 @@ void LlamaBatch<T>::ProcessInferRequests(const Requests& requests)
             int    begin = ranges[i * 2];
             int    end   = ranges[i * 2 + 1];
             size_t count = (end - begin) * model_->hidden_units_ * sizeof(T);
-            seq.input_embeddings.emplace_back((std::byte*)emb_tensor_ptr, (std::byte*)(emb_tensor_ptr + count));
+            seq.input_embeddings.emplace_back((uint8_t*)emb_tensor_ptr, (uint8_t*)(emb_tensor_ptr + count));
             seq.input_embedding_ranges.emplace_back(begin + seq.tokens.size(), end + seq.tokens.size());
             emb_tensor_ptr += count;
         }
...
...
@@ -796,13 +796,13 @@ void LlamaBatch<T>::AllocatePersistantBuffer(size_t max_batch_size)
     h_end_ids_buf_ = (int*)allocator_->reMalloc(h_end_ids_buf_, sizeof(int) * max_batch_size, false, true);

     sampling_params_ = {
-        {"stop_words_list", (std::byte*)h_stop_words_, (std::byte*)d_stop_words_},
-        {"bad_words_list", (std::byte*)h_bad_words_, (std::byte*)d_bad_words_},
-        {"min_length", (std::byte*)h_min_length_, nullptr},
-        {"runtime_top_k", (std::byte*)h_runtime_top_k_, nullptr},
-        {"runtime_top_p", (std::byte*)h_runtime_top_p_, nullptr},
-        {"temperature", (std::byte*)h_temperature_, nullptr},
-        {"repetition_penalty", (std::byte*)h_repetition_penalty_, nullptr},
+        {"stop_words_list", (uint8_t*)h_stop_words_, (uint8_t*)d_stop_words_},
+        {"bad_words_list", (uint8_t*)h_bad_words_, (uint8_t*)d_bad_words_},
+        {"min_length", (uint8_t*)h_min_length_, nullptr},
+        {"runtime_top_k", (uint8_t*)h_runtime_top_k_, nullptr},
+        {"runtime_top_p", (uint8_t*)h_runtime_top_p_, nullptr},
+        {"temperature", (uint8_t*)h_temperature_, nullptr},
+        {"repetition_penalty", (uint8_t*)h_repetition_penalty_, nullptr},
     };

     for (auto& s : states_) {
...
...
@@ -1056,7 +1056,7 @@ void LlamaBatch<T>::InitializeSampling(const GenerationState& g)
             if (state_->requests[i]->inputs[rank_].isExist(name)) {
                 Tensor& src = state_->requests[i]->inputs[rank_].at(name);
                 FT_CHECK(ref.shape == src.shape);
-                std::copy_n(src.getPtr<std::byte>(), size_in_bytes, h_ptr + size_in_bytes * i);
+                std::copy_n(src.getPtr<uint8_t>(), size_in_bytes, h_ptr + size_in_bytes * i);
             }
         }
         if (d_ptr) {
...
...
src/turbomind/models/llama/LlamaBatch.h
View file @
2326380c
...
...
@@ -284,7 +284,7 @@ private:
     TensorMap inputs_;
     TensorMap outputs_;

-    std::vector<std::tuple<std::string, std::byte*, std::byte*>> sampling_params_;
+    std::vector<std::tuple<std::string, uint8_t*, uint8_t*>> sampling_params_;

     cudaStream_t     stream_{};
     cublasMMWrapper* cublas_wrapper_{};
...
...
src/turbomind/models/llama/SequenceManager.h
View file @
2326380c
...
...
@@ -29,12 +29,12 @@ struct Sequence {
     mutable int cache_len = 0;

     // additional data kept round-to-round
-    mutable std::vector<std::byte> random_state;  // update by user
+    mutable std::vector<uint8_t> random_state;  // update by user

     mutable float rope_theta = 0.f;

     // embedding data
-    mutable std::vector<std::vector<std::byte>> input_embeddings;
+    mutable std::vector<std::vector<uint8_t>> input_embeddings;
     mutable std::vector<std::pair<int, int>> input_embedding_ranges;

     explicit Sequence(uint64_t _id): id(_id) {}
...
...
@@ -99,7 +99,7 @@ public:
     [[nodiscard]] void* GetValPtr(int block_id)
     {
-        return (std::byte*)GetKeyPtr(block_id) + val_offset_;
+        return (uint8_t*)GetKeyPtr(block_id) + val_offset_;
     }

     int max_block_count() const noexcept
...
...
src/turbomind/utils/CMakeLists.txt
View file @
2326380c
...
...
@@ -115,4 +115,4 @@ endif()
 add_library(tensor STATIC Tensor.cc)
 #set_property(TARGET tensor PROPERTY POSITION_INDEPENDENT_CODE  ON)
 #set_property(TARGET tensor PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-target_link_libraries(tensor PUBLIC cuda_utils logger)
+target_link_libraries(tensor PUBLIC cuda_utils logger -lstdc++fs)
src/turbomind/utils/Tensor.cc
View file @
2326380c
...
...
@@ -22,7 +22,8 @@
 #include "stdlib.h"
 #include <cuda_fp16.h>
 #include <cuda_runtime_api.h>
-#include <filesystem>
+// #include <filesystem>
+#include <experimental/filesystem>
 #include <numeric>
 #include <stdlib.h>
 #include <string>
...
...
@@ -31,7 +32,8 @@
 #include <unordered_map>
 #include <vector>

-namespace fs = std::filesystem;
+// namespace fs = std::filesystem;
+namespace fs = std::experimental::filesystem;

 namespace turbomind {

 Tensor::Tensor():
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment