Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Lmdeploy
Commits
cad70512
Commit
cad70512
authored
May 17, 2024
by
zhouxiang
Browse files
1、取出dcu不支持的依赖;2、支持gcc7
parent
89f614ad
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
28 additions
and
26 deletions
+28
-26
requirements/lite.txt
requirements/lite.txt
+1
-1
src/turbomind/models/llama/BlockManager.cc
src/turbomind/models/llama/BlockManager.cc
+1
-1
src/turbomind/models/llama/LlamaBatch.cc
src/turbomind/models/llama/LlamaBatch.cc
+8
-8
src/turbomind/models/llama/LlamaBatch.h
src/turbomind/models/llama/LlamaBatch.h
+1
-1
src/turbomind/models/llama/SequenceManager.h
src/turbomind/models/llama/SequenceManager.h
+3
-3
src/turbomind/utils/CMakeLists.txt
src/turbomind/utils/CMakeLists.txt
+1
-1
src/turbomind/utils/Tensor.cc
src/turbomind/utils/Tensor.cc
+4
-2
tests/csrc/unittests/test_sampling_kernels.cu
tests/csrc/unittests/test_sampling_kernels.cu
+9
-9
No files found.
requirements/lite.txt
View file @
cad70512
accelerate
datasets
flash-attn
#
flash-attn
src/turbomind/models/llama/BlockManager.cc
View file @
cad70512
...
...
@@ -61,7 +61,7 @@ bool BlockManager::Malloc()
return
false
;
}
auto
ptr
=
(
std
::
byte
*
)
allocator_
->
malloc
(
block_size_
*
chunk_size
);
auto
ptr
=
(
uint8_t
*
)
allocator_
->
malloc
(
block_size_
*
chunk_size
);
if
(
!
ptr
)
{
return
false
;
}
...
...
src/turbomind/models/llama/LlamaBatch.cc
View file @
cad70512
...
...
@@ -320,7 +320,7 @@ void LlamaBatch<T>::ProcessInferRequests(const Requests& requests)
int
begin
=
ranges
[
i
*
2
];
int
end
=
ranges
[
i
*
2
+
1
];
size_t
count
=
(
end
-
begin
)
*
model_
->
hidden_units_
*
sizeof
(
T
);
seq
.
input_embeddings
.
emplace_back
((
std
::
byte
*
)
emb_tensor_ptr
,
(
std
::
byte
*
)(
emb_tensor_ptr
+
count
));
seq
.
input_embeddings
.
emplace_back
((
uint8_t
*
)
emb_tensor_ptr
,
(
uint8_t
*
)(
emb_tensor_ptr
+
count
));
seq
.
input_embedding_ranges
.
emplace_back
(
begin
+
seq
.
tokens
.
size
(),
end
+
seq
.
tokens
.
size
());
emb_tensor_ptr
+=
count
;
}
...
...
@@ -789,12 +789,12 @@ void LlamaBatch<T>::AllocatePersistantBuffer(size_t max_batch_size)
h_end_ids_buf_
=
(
int
*
)
allocator_
->
reMalloc
(
h_end_ids_buf_
,
sizeof
(
int
)
*
max_batch_size
,
false
,
true
);
sampling_params_
=
{
{
"stop_words_list"
,
(
std
::
byte
*
)
h_stop_words_
,
(
std
::
byte
*
)
d_stop_words_
},
{
"bad_words_list"
,
(
std
::
byte
*
)
h_bad_words_
,
(
std
::
byte
*
)
d_bad_words_
},
{
"runtime_top_k"
,
(
std
::
byte
*
)
h_runtime_top_k_
,
nullptr
},
{
"runtime_top_p"
,
(
std
::
byte
*
)
h_runtime_top_p_
,
nullptr
},
{
"temperature"
,
(
std
::
byte
*
)
h_temperature_
,
nullptr
},
{
"repetition_penalty"
,
(
std
::
byte
*
)
h_repetition_penalty_
,
nullptr
},
{
"stop_words_list"
,
(
uint8_t
*
)
h_stop_words_
,
(
uint8_t
*
)
d_stop_words_
},
{
"bad_words_list"
,
(
uint8_t
*
)
h_bad_words_
,
(
uint8_t
*
)
d_bad_words_
},
{
"runtime_top_k"
,
(
uint8_t
*
)
h_runtime_top_k_
,
nullptr
},
{
"runtime_top_p"
,
(
uint8_t
*
)
h_runtime_top_p_
,
nullptr
},
{
"temperature"
,
(
uint8_t
*
)
h_temperature_
,
nullptr
},
{
"repetition_penalty"
,
(
uint8_t
*
)
h_repetition_penalty_
,
nullptr
},
};
for
(
auto
&
s
:
states_
)
{
...
...
@@ -1041,7 +1041,7 @@ void LlamaBatch<T>::InitializeSampling(const GenerationState& g)
if
(
state_
->
requests
[
i
]
->
inputs
[
rank_
].
isExist
(
name
))
{
Tensor
&
src
=
state_
->
requests
[
i
]
->
inputs
[
rank_
].
at
(
name
);
FT_CHECK
(
ref
.
shape
==
src
.
shape
);
std
::
copy_n
(
src
.
getPtr
<
std
::
byte
>
(),
size_in_bytes
,
h_ptr
+
size_in_bytes
*
i
);
std
::
copy_n
(
src
.
getPtr
<
uint8_t
>
(),
size_in_bytes
,
h_ptr
+
size_in_bytes
*
i
);
}
}
if
(
d_ptr
)
{
...
...
src/turbomind/models/llama/LlamaBatch.h
View file @
cad70512
...
...
@@ -280,7 +280,7 @@ private:
TensorMap
inputs_
;
TensorMap
outputs_
;
std
::
vector
<
std
::
tuple
<
std
::
string
,
std
::
byte
*
,
std
::
byte
*>>
sampling_params_
;
std
::
vector
<
std
::
tuple
<
std
::
string
,
uint8_t
*
,
uint8_t
*>>
sampling_params_
;
cudaStream_t
stream_
{};
cublasMMWrapper
*
cublas_wrapper_
{};
...
...
src/turbomind/models/llama/SequenceManager.h
View file @
cad70512
...
...
@@ -29,12 +29,12 @@ struct Sequence {
mutable
int
cache_len
=
0
;
// additional data kept round-to-round
mutable
std
::
vector
<
std
::
byte
>
random_state
;
// update by user
mutable
std
::
vector
<
uint8_t
>
random_state
;
// update by user
mutable
float
rope_theta
=
0.
f
;
// embedding data
mutable
std
::
vector
<
std
::
vector
<
std
::
byte
>>
input_embeddings
;
mutable
std
::
vector
<
std
::
vector
<
uint8_t
>>
input_embeddings
;
mutable
std
::
vector
<
std
::
pair
<
int
,
int
>>
input_embedding_ranges
;
explicit
Sequence
(
uint64_t
_id
)
:
id
(
_id
)
{}
...
...
@@ -98,7 +98,7 @@ public:
[[
nodiscard
]]
void
*
GetValPtr
(
int
block_id
)
{
return
(
std
::
byte
*
)
GetKeyPtr
(
block_id
)
+
val_offset_
;
return
(
uint8_t
*
)
GetKeyPtr
(
block_id
)
+
val_offset_
;
}
int
max_block_count
()
const
noexcept
...
...
src/turbomind/utils/CMakeLists.txt
View file @
cad70512
...
...
@@ -114,4 +114,4 @@ endif()
add_library
(
tensor STATIC Tensor.cc
)
#set_property(TARGET tensor PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET tensor PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries
(
tensor PUBLIC cuda_utils logger
)
target_link_libraries
(
tensor PUBLIC cuda_utils logger
-lstdc++fs
)
src/turbomind/utils/Tensor.cc
View file @
cad70512
...
...
@@ -22,7 +22,8 @@
#include "stdlib.h"
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <filesystem>
//#include <filesystem>
#include <experimental/filesystem>
#include <numeric>
#include <stdlib.h>
#include <string>
...
...
@@ -31,7 +32,8 @@
#include <unordered_map>
#include <vector>
namespace
fs
=
std
::
filesystem
;
//namespace fs = std::filesystem;
namespace
fs
=
std
::
experimental
::
filesystem
;
namespace
turbomind
{
Tensor
::
Tensor
()
:
...
...
tests/csrc/unittests/test_sampling_kernels.cu
View file @
cad70512
...
...
@@ -804,15 +804,15 @@ TYPED_TEST(TopPSamplingKernelTest, BatchCorrectnessLargeP)
this
->
runBatchTest
({
6
,
4
,
1
,
0
,
0.9
f
,
1
});
};
TYPED_TEST
(
TopPSamplingKernelTest
,
BatchCorrectnessSmallP2
)
{
this
->
runBatchTest
({
8
,
4000
,
1
,
0
,
0.2
f
,
16
});
};
TYPED_TEST
(
TopPSamplingKernelTest
,
BatchCorrectnessLargeP2
)
{
this
->
runBatchTest
({
8
,
4000
,
1
,
0
,
0.9
f
,
16
});
};
//
TYPED_TEST(TopPSamplingKernelTest, BatchCorrectnessSmallP2)
//
{
//
this->runBatchTest({8, 4000, 1, 0, 0.2f, 16});
//
};
//
TYPED_TEST(TopPSamplingKernelTest, BatchCorrectnessLargeP2)
//
{
//
this->runBatchTest({8, 4000, 1, 0, 0.9f, 16});
//
};
__global__
void
generateRandomNumber
(
unsigned
int
*
vals
,
curandState_t
*
states
,
const
int
batch_size
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment