OpenDAS / Lmdeploy · Commits

Commit 981a4610, authored Jul 31, 2023 by Li Zhang, committed by GitHub on Jul 31, 2023. Parent: 83697422.

[Fix] Remove unused code to reduce binary size (#181)

* clean-up
* fix lint
* fix lint
Showing 20 changed files with 0 additions and 2563 deletions:
src/turbomind/layers/FfnFP8Layer.cc (+0 -535)
src/turbomind/layers/FfnFP8Layer.h (+0 -133)
src/turbomind/layers/FfnFP8Weight.h (+0 -30)
src/turbomind/layers/FfnINT8Weight.h (+0 -28)
src/turbomind/layers/FfnLayerINT8.cc (+0 -340)
src/turbomind/layers/FfnLayerINT8.h (+0 -146)
src/turbomind/layers/attention_layers_fp8/AttentionFP8Weight.h (+0 -34)
src/turbomind/layers/attention_layers_fp8/BaseAttentionFP8Layer.h (+0 -65)
src/turbomind/layers/attention_layers_fp8/CMakeLists.txt (+0 -15)
src/turbomind/layers/attention_layers_int8/AttentionINT8Weight.h (+0 -29)
src/turbomind/layers/attention_layers_int8/CMakeLists.txt (+0 -15)
src/turbomind/layers/beam_search_layers/BaseBeamSearchLayer.cu (+0 -291)
src/turbomind/layers/beam_search_layers/BaseBeamSearchLayer.h (+0 -80)
src/turbomind/layers/beam_search_layers/BeamSearchLayer.cu (+0 -354)
src/turbomind/layers/beam_search_layers/BeamSearchLayer.h (+0 -68)
src/turbomind/layers/beam_search_layers/CMakeLists.txt (+0 -30)
src/turbomind/layers/beam_search_layers/OnlineBeamSearchLayer.cu (+0 -249)
src/turbomind/layers/beam_search_layers/OnlineBeamSearchLayer.h (+0 -65)
src/turbomind/models/llama/CMakeLists.txt (+0 -1)
src/turbomind/models/llama/prefix_cache.cu (+0 -55)
src/turbomind/layers/FfnFP8Layer.cc (deleted, 100644 → 0)
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/turbomind/layers/FfnFP8Layer.h"
#include "src/turbomind/kernels/activation_fp8_kernels.h"
#include "src/turbomind/utils/cublasFP8MMWrapper.h"
#include "src/turbomind/utils/nvtx_utils.h"
namespace turbomind {

template<typename T1, typename T2>
void FfnFP8Layer<T1, T2>::forward(TensorMap* output_tensors, TensorMap* input_tensors, const FfnFP8Weight<T1, T2>* ffn_weights)
{
    // input tensors:
    //      input_hidden_state [token_num, d_model],
    // output tensors:
    //      output_hidden_state [token_num, d_model],
    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    FT_CHECK(input_tensors->size() == 1);
    FT_CHECK(output_tensors->size() == 1);

    const int m       = input_tensors->at("input_hidden_state").shape[0];
    const int d_model = input_tensors->at("input_hidden_state").shape[1];

    const T1* input_hidden_state = input_tensors->at("input_hidden_state").getPtr<T1>();
    Tensor    output_tensor      = output_tensors->at("output_hidden_state");

    allocateBuffer(m);

#ifdef FUSE_GEMM_ACT
    if (fp8_mode_ == 1) {
        const float alpha = 1.0f;
        const float beta  = 0.0f;
        reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
            ->Gemm(inter_buf_bf16_,
                   (int)1,
                   (int)m,
                   (int)inter_size_,
                   (int)d_model,
                   (int64_t)0,
                   (int64_t)0,
                   (int64_t)0,
                   &alpha,
                   &beta,
                   input_hidden_state,
                   ffn_weights->intermediate_weight.kernel,
                   ffn_weights->intermediate_weight.input_scale,
                   ffn_weights->intermediate_weight.per_channel_scale_min,  // identity_scale
                   stream_);
        invokeAddBiasActivation(m,
                                ffn_weights->intermediate_weight.bias,
                                ffn_weights->intermediate_weight.output_scale,
                                ffn_weights->intermediate_weight.scale,
                                ffn_weights->intermediate_weight.per_channel_scale_min,
                                ffn_weights->output_weight.input_scale_inv);
    }
    else if (fp8_mode_ == 2) {
#ifdef USE_QGMMA
        if (getActivationType() == ActivationType::Gelu) {
            PUSH_RANGE("FFN gemm 1 bias gelu");
            reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
                ->Conv1x1Gemm<false, true>(inter_buf_,
                                           m,
                                           inter_size_,
                                           d_model,
                                           input_hidden_state,
                                           ffn_weights->intermediate_weight.kernel,
                                           ffn_weights->intermediate_weight.bias,
                                           *(ffn_weights->intermediate_weight.input_h_scale),   // scale_a,
                                           *(ffn_weights->intermediate_weight.weight_h_scale),  // scale_b,
                                           *(ffn_weights->output_weight.input_h_scale_inv),     // scale_d,
                                           stream_);
            POP_RANGE;
        }
        else if (getActivationType() == ActivationType::Relu) {
            reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
                ->Conv1x1Gemm<true, false>(inter_buf_,
                                           m,
                                           inter_size_,
                                           d_model,
                                           input_hidden_state,
                                           ffn_weights->intermediate_weight.kernel,
                                           ffn_weights->intermediate_weight.bias,
                                           *(ffn_weights->intermediate_weight.input_h_scale),   // scale_a,
                                           *(ffn_weights->intermediate_weight.weight_h_scale),  // scale_b,
                                           *(ffn_weights->output_weight.input_h_scale_inv),     // scale_d,
                                           stream_);
        }
#else   // USE_QGMMA
        const float alpha = 1.0f;
        const float beta  = 0.0f;
        if (getActivationType() == ActivationType::Gelu) {
            reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
#ifdef FP8_GEMM_OUTPUT_QUANT_DISABLE
                ->Gemm_Bias_Act<false, true>(inter_buf_bf16_,
#else   // FP8_GEMM_OUTPUT_QUANT_DISABLE
                ->Gemm_Bias_Act<false, true>(inter_buf_,
#endif  // FP8_GEMM_OUTPUT_QUANT_DISABLE
                                             (int)1,
                                             (int)m,
                                             (int)inter_size_,
                                             (int)d_model,
                                             (int64_t)0,
                                             (int64_t)0,
                                             (int64_t)0,
                                             &alpha,
                                             &beta,
                                             input_hidden_state,
                                             ffn_weights->intermediate_weight.kernel,
                                             ffn_weights->intermediate_weight.input_scale,
                                             ffn_weights->intermediate_weight.weight_scale,
                                             ffn_weights->intermediate_weight.bias,
                                             ffn_weights->intermediate_weight.output_scale,
                                             stream_);
        }
        else if (getActivationType() == ActivationType::Relu) {
            reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
#ifdef FP8_GEMM_OUTPUT_QUANT_DISABLE
                ->Gemm_Bias_Act<true, false>(inter_buf_bf16_,
#else   // FP8_GEMM_OUTPUT_QUANT_DISABLE
                ->Gemm_Bias_Act<true, false>(inter_buf_,
#endif  // FP8_GEMM_OUTPUT_QUANT_DISABLE
                                             (int)1,
                                             (int)m,
                                             (int)inter_size_,
                                             (int)d_model,
                                             (int64_t)0,
                                             (int64_t)0,
                                             (int64_t)0,
                                             &alpha,
                                             &beta,
                                             input_hidden_state,
                                             ffn_weights->intermediate_weight.kernel,
                                             ffn_weights->intermediate_weight.input_scale,
                                             ffn_weights->intermediate_weight.weight_scale,
                                             ffn_weights->intermediate_weight.bias,
                                             ffn_weights->intermediate_weight.output_scale,
                                             stream_);
        }
#ifdef FP8_GEMM_OUTPUT_QUANT_DISABLE
        invokeQuantizeMatrix<T1, T2, QUANTIZE_MODE::PER_TENSOR>(
            inter_buf_, ffn_weights->output_weight.input_scale_inv, inter_buf_bf16_, m * inter_size_, 1, stream_);
#endif  // FP8_GEMM_OUTPUT_QUANT_DISABLE
#endif  // USE_QGMMA
    }
#else   // FUSE_GEMM_ACT
    PUSH_RANGE("FFN gemm 1");
#ifdef SPARSITY_ENABLED
    int m_tmp = m;
    if (m_tmp % 8 != 0) {
        m_tmp = (m_tmp / 8 + 1) * 8;
    }
    const int m_padded = m_tmp;
    if (sparse_ && cublas_wrapper_->isUseSparse(1, inter_size_, m, d_model)) {
        FT_CHECK(false);
        // cublas_wrapper_->SpGemm(CUBLAS_OP_N,
        //                         CUBLAS_OP_N,
        //                         inter_size_,
        //                         m_padded,
        //                         d_model,
        //                         ffn_weights->intermediate_weight.sp_kernel,
        //                         input_hidden_state,
        //                         inter_buf_);
    }
    else {
#endif  // SPARSITY_ENABLED
        if (fp8_mode_ == 1) {
            const float alpha = 1.0f;
            const float beta  = 0.0f;
            reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
                ->Gemm(inter_buf_bf16_,
                       (int)1,
                       (int)m,
                       (int)inter_size_,
                       (int)d_model,
                       (int64_t)0,
                       (int64_t)0,
                       (int64_t)0,
                       &alpha,
                       &beta,
                       input_hidden_state,
                       ffn_weights->intermediate_weight.kernel,
                       ffn_weights->intermediate_weight.input_scale,
                       ffn_weights->intermediate_weight.per_channel_scale_min,  // identity_scale
                       stream_);
        }
        else if (fp8_mode_ == 2) {
            const float alpha = 1.0f;
            const float beta  = 0.0f;
            reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
                ->Gemm(inter_buf_bf16_,
                       (int)1,
                       (int)m,
                       (int)inter_size_,
                       (int)d_model,
                       (int64_t)0,
                       (int64_t)0,
                       (int64_t)0,
                       &alpha,
                       &beta,
                       input_hidden_state,
                       ffn_weights->intermediate_weight.kernel,
                       ffn_weights->intermediate_weight.input_scale,
                       ffn_weights->intermediate_weight.weight_scale,
                       stream_);
        }
#ifdef SPARSITY_ENABLED
    }
#endif  // SPARSITY_ENABLED
    POP_RANGE;

    PUSH_RANGE("FFN add bias act");
    if (fp8_mode_ == 1) {
        invokeAddBiasActivation(m,
                                ffn_weights->intermediate_weight.bias,
                                ffn_weights->intermediate_weight.output_scale,
                                ffn_weights->intermediate_weight.scale,
                                ffn_weights->intermediate_weight.per_channel_scale_min,
                                ffn_weights->output_weight.input_scale_inv);
    }
    else if (fp8_mode_ == 2) {
        invokeAddBiasActivation(m,
                                ffn_weights->intermediate_weight.bias,
                                ffn_weights->intermediate_weight.output_scale,
                                nullptr,
                                nullptr,
                                ffn_weights->output_weight.input_scale_inv);
    }
    sync_check_cuda_error();
    POP_RANGE;
#endif  // FUSE_GEMM_ACT

    PUSH_RANGE("FFN gemm 2");
#ifdef SPARSITY_ENABLED
    if (sparse_ && cublas_wrapper_->isUseSparse(1, d_model, m, inter_size_)) {
        FT_CHECK(false);
        // cublas_wrapper_->SpGemm(CUBLAS_OP_N,
        //                         CUBLAS_OP_N,
        //                         d_model,
        //                         m_padded,
        //                         inter_size_,
        //                         ffn_weights->output_weight.sp_kernel,
        //                         inter_buf_,
        //                         output_tensor);
    }
    else {
#endif  // SPARSITY_ENABLED
        if (fp8_mode_ == 1) {
            const float alpha = 1.0f;
            const float beta  = 0.0f;
            if (output_tensor.type == TYPE_BF16) {
                reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
                    ->Gemm(output_tensor.getPtr<T2>(),
                           (int)1,
                           (int)m,
                           (int)d_model,
                           (int)inter_size_,
                           (int64_t)0,
                           (int64_t)0,
                           (int64_t)0,
                           &alpha,
                           &beta,
                           (const __nv_fp8_e4m3*)inter_buf_,
                           (const __nv_fp8_e4m3*)ffn_weights->output_weight.kernel,
                           ffn_weights->output_weight.input_scale,
                           ffn_weights->identity_scale,
                           stream_);
            }
            else if (output_tensor.type == TYPE_FP8_E4M3) {
                const float alpha = 1.0f;
                const float beta  = 0.0f;
                reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
                    ->Gemm(output_tensor.getPtr<T1>(),
                           (int)1,
                           (int)m,
                           (int)d_model,
                           (int)inter_size_,
                           (int64_t)0,
                           (int64_t)0,
                           (int64_t)0,
                           &alpha,
                           &beta,
                           (const __nv_fp8_e4m3*)inter_buf_,
                           (const __nv_fp8_e4m3*)ffn_weights->output_weight.kernel,
                           ffn_weights->output_weight.input_scale,
                           ffn_weights->output_weight.per_channel_scale_min,
                           ffn_weights->output_weight.output_scale_inv,
                           stream_);
            }
            else {
                FT_CHECK(false);
            }
        }
        else if (fp8_mode_ == 2) {
            if (output_tensor.type == TYPE_BF16) {
                const float alpha = 1.0f;
                const float beta  = 0.0f;
                reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
                    ->Gemm(output_tensor.getPtr<T2>(),
                           (int)1,
                           (int)m,
                           (int)d_model,
                           (int)inter_size_,
                           (int64_t)0,
                           (int64_t)0,
                           (int64_t)0,
                           &alpha,
                           &beta,
                           (const __nv_fp8_e4m3*)inter_buf_,
                           (const __nv_fp8_e4m3*)ffn_weights->output_weight.kernel,
                           ffn_weights->output_weight.input_scale,
                           ffn_weights->output_weight.weight_scale,
                           stream_);
            }
            else if (output_tensor.type == TYPE_FP8_E4M3) {
                // It looks like conv1x1Gemm does not bring better performance for this gemm
                // because the k dimension of this gemm is large
                // #ifdef USE_QGMMA
                //     reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
                //         ->Conv1x1Gemm<false, false>(output_tensor.getPtr<T1>(),
                //                                     m,
                //                                     d_model,
                //                                     inter_size_,
                //                                     inter_buf_,
                //                                     ffn_weights->output_weight.kernel,
                //                                     ffn_weights->output_weight.bias,
                //                                     *(ffn_weights->output_weight.input_h_scale),       // scale_a,
                //                                     *(ffn_weights->output_weight.weight_h_scale),      // scale_b,
                //                                     *(ffn_weights->output_weight.output_h_scale_inv),  // scale_d,
                //                                     stream_);
                // #else   // USE_QGMMA
                const float alpha = 1.0f;
                const float beta  = 0.0f;
                reinterpret_cast<cublasFP8MMWrapper*>(cublas_wrapper_)
                    ->Gemm(output_tensor.getPtr<T1>(),
                           (int)1,
                           (int)m,
                           (int)d_model,
                           (int)inter_size_,
                           (int64_t)0,
                           (int64_t)0,
                           (int64_t)0,
                           &alpha,
                           &beta,
                           (const __nv_fp8_e4m3*)inter_buf_,
                           (const __nv_fp8_e4m3*)ffn_weights->output_weight.kernel,
                           ffn_weights->output_weight.input_scale,
                           ffn_weights->output_weight.weight_scale,
                           ffn_weights->output_weight.output_scale_inv,
                           stream_);
                // #endif  // USE_QGMMA
            }
            else {
                FT_CHECK(false);
            }
        }
#ifdef SPARSITY_ENABLED
    }
#endif  // SPARSITY_ENABLED
    POP_RANGE;

    sync_check_cuda_error();
    if (is_free_buffer_after_forward_ == true) {
        freeBuffer();
    }
    sync_check_cuda_error();
}

template<typename T1, typename T2>
FfnFP8Layer<T1, T2>::FfnFP8Layer(size_t           inter_size,
                                 int              fp8_mode,
                                 cudaStream_t     stream,
                                 cublasMMWrapper* cublas_wrapper,
                                 IAllocator*      allocator,
                                 bool             is_free_buffer_after_forward,
                                 bool             sparse):
    BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, nullptr, sparse),
    inter_size_(inter_size),
    fp8_mode_(fp8_mode)
{
    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
}

template<typename T1, typename T2>
FfnFP8Layer<T1, T2>::FfnFP8Layer(FfnFP8Layer<T1, T2> const& ffn_layer):
    BaseLayer(ffn_layer.stream_,
              ffn_layer.cublas_wrapper_,
              ffn_layer.allocator_,
              ffn_layer.is_free_buffer_after_forward_,
              ffn_layer.cuda_device_prop_,
              ffn_layer.sparse_),
    inter_size_(ffn_layer.inter_size_),
    fp8_mode_(ffn_layer.fp8_mode_)
{
    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
}

template<typename T1, typename T2>
FfnFP8Layer<T1, T2>::~FfnFP8Layer()
{
    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    cublas_wrapper_ = nullptr;
    freeBuffer();
}

template<typename T1, typename T2>
void FfnFP8Layer<T1, T2>::allocateBuffer()
{
    FT_CHECK(false);
}

template<typename T1, typename T2>
void FfnFP8Layer<T1, T2>::allocateBuffer(size_t token_num)
{
    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    inter_buf_      = (T1*)allocator_->reMalloc(inter_buf_, sizeof(T1) * token_num * inter_size_, false);
    inter_buf_bf16_ = (T2*)allocator_->reMalloc(inter_buf_bf16_, sizeof(T2) * token_num * inter_size_, false);
    is_allocate_buffer_ = true;
}

template<typename T1, typename T2>
void FfnFP8Layer<T1, T2>::freeBuffer()
{
    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    if (is_allocate_buffer_) {
        allocator_->free((void**)(&inter_buf_));
        allocator_->free((void**)(&inter_buf_bf16_));
        is_allocate_buffer_ = false;
    }
}

template class FfnFP8Layer<__nv_fp8_e4m3, __nv_bfloat16>;

template<typename T1, typename T2>
GeluFfnFP8Layer<T1, T2>::GeluFfnFP8Layer(size_t           inter_size,
                                         int              fp8_mode,
                                         cudaStream_t     stream,
                                         cublasMMWrapper* cublas_wrapper,
                                         IAllocator*      allocator,
                                         bool             is_free_buffer_after_forward,
                                         bool             sparse):
    FfnFP8Layer<T1, T2>(inter_size, fp8_mode, stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse)
{
}

template<typename T1, typename T2>
GeluFfnFP8Layer<T1, T2>::GeluFfnFP8Layer(GeluFfnFP8Layer<T1, T2> const& gelu_ffn_layer):
    FfnFP8Layer<T1, T2>(gelu_ffn_layer)
{
}

template<typename T1, typename T2>
void GeluFfnFP8Layer<T1, T2>::invokeAddBiasActivation(const int    m,
                                                      const T2*    bias,
                                                      const float* input_scale,
                                                      const float* input_scale_2,
                                                      const float* input_scale_2_min,
                                                      const float* output_scale)
{
    FP8ActivationParam<T1, T2> param{inter_buf_bf16_,
                                     inter_buf_,
                                     bias,
                                     input_scale,
                                     input_scale_2,
                                     input_scale_2_min,
                                     output_scale,
                                     (uint32_t)m,
                                     (uint32_t)inter_size_,
                                     stream_};
    invokeFP8AddBiasGelu<T1, T2>(param);
}

template class GeluFfnFP8Layer<__nv_fp8_e4m3, __nv_bfloat16>;

template<typename T1, typename T2>
ReluFfnFP8Layer<T1, T2>::ReluFfnFP8Layer(size_t           inter_size,
                                         int              fp8_mode,
                                         cudaStream_t     stream,
                                         cublasMMWrapper* cublas_wrapper,
                                         IAllocator*      allocator,
                                         bool             is_free_buffer_after_forward,
                                         bool             sparse):
    FfnFP8Layer<T1, T2>(inter_size, fp8_mode, stream, cublas_wrapper, allocator, is_free_buffer_after_forward, sparse)
{
}

template<typename T1, typename T2>
ReluFfnFP8Layer<T1, T2>::ReluFfnFP8Layer(ReluFfnFP8Layer<T1, T2> const& relu_ffn_layer):
    FfnFP8Layer<T1, T2>(relu_ffn_layer)
{
}

template<typename T1, typename T2>
void ReluFfnFP8Layer<T1, T2>::invokeAddBiasActivation(const int    m,
                                                      const T2*    bias,
                                                      const float* input_scale,
                                                      const float* input_scale_2,
                                                      const float* input_scale_2_min,
                                                      const float* output_scale)
{
    FP8ActivationParam<T1, T2> param{inter_buf_bf16_,
                                     inter_buf_,
                                     bias,
                                     input_scale,
                                     input_scale_2,
                                     input_scale_2_min,
                                     output_scale,
                                     (uint32_t)m,
                                     (uint32_t)inter_size_,
                                     stream_};
    invokeFP8AddBiasRelu<T1, T2>(param);
}

template class ReluFfnFP8Layer<__nv_fp8_e4m3, __nv_bfloat16>;

}  // namespace turbomind
src/turbomind/layers/FfnFP8Layer.h (deleted, 100644 → 0)
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/turbomind/layers/BaseLayer.h"
#include "src/turbomind/layers/FfnFP8Weight.h"
#include "src/turbomind/layers/FfnLayer.h"
#include "src/turbomind/utils/memory_utils.h"
#include <vector>
namespace turbomind {

template<typename T1, typename T2>
class FfnFP8Layer: public BaseLayer {
private:
    void allocateBuffer() override;
    void freeBuffer() override;
    void allocateBuffer(size_t token_num);

protected:
    const int fp8_mode_;
    T1*       inter_buf_      = nullptr;
    T2*       inter_buf_bf16_ = nullptr;
    size_t    inter_size_;

    virtual void invokeAddBiasActivation(const int    m,
                                         const T2*    bias,
                                         const float* input_scale,
                                         const float* input_scale_2,
                                         const float* input_scale_2_min,
                                         const float* output_scale) = 0;

public:
    FfnFP8Layer(size_t           inter_size,
                int              fp8_mode,
                cudaStream_t     stream,
                cublasMMWrapper* cublas_wrapper,
                IAllocator*      allocator,
                bool             is_free_buffer_after_forward,
                bool             sparse = false);

    FfnFP8Layer(FfnFP8Layer<T1, T2> const& ffn_layer);

    virtual ~FfnFP8Layer();

    virtual void forward(TensorMap* output_tensors, TensorMap* input_tensors, const FfnFP8Weight<T1, T2>* ffn_weights);

    virtual ActivationType getActivationType() = 0;
};

template<typename T1, typename T2>
class GeluFfnFP8Layer: public FfnFP8Layer<T1, T2> {
public:
    GeluFfnFP8Layer(size_t           inter_size,
                    int              fp8_mode_,
                    cudaStream_t     stream,
                    cublasMMWrapper* cublas_wrapper,
                    IAllocator*      allocator,
                    bool             is_free_buffer_after_forward,
                    bool             sparse = false);

    GeluFfnFP8Layer(GeluFfnFP8Layer<T1, T2> const& ffn_layer);

    virtual ~GeluFfnFP8Layer() = default;

    ActivationType getActivationType() override
    {
        return ActivationType::Gelu;
    };

protected:
    using FfnFP8Layer<T1, T2>::stream_;

private:
    using FfnFP8Layer<T1, T2>::inter_buf_;
    using FfnFP8Layer<T1, T2>::inter_size_;
    using FfnFP8Layer<T1, T2>::fp8_mode_;
    using FfnFP8Layer<T1, T2>::inter_buf_bf16_;

    void invokeAddBiasActivation(const int    m,
                                 const T2*    bias,
                                 const float* input_scale,
                                 const float* input_scale_2,
                                 const float* input_scale_2_min,
                                 const float* output_scale) override;
};

template<typename T1, typename T2>
class ReluFfnFP8Layer: public FfnFP8Layer<T1, T2> {
public:
    ReluFfnFP8Layer(size_t           inter_size,
                    int              fp8_mode,
                    cudaStream_t     stream,
                    cublasMMWrapper* cublas_wrapper,
                    IAllocator*      allocator,
                    bool             is_free_buffer_after_forward,
                    bool             sparse = false);

    ReluFfnFP8Layer(ReluFfnFP8Layer<T1, T2> const& ffn_layer);

    virtual ~ReluFfnFP8Layer() = default;

    ActivationType getActivationType() override
    {
        return ActivationType::Relu;
    };

protected:
    using FfnFP8Layer<T1, T2>::stream_;

private:
    using FfnFP8Layer<T1, T2>::inter_buf_;
    using FfnFP8Layer<T1, T2>::inter_size_;
    using FfnFP8Layer<T1, T2>::fp8_mode_;
    using FfnFP8Layer<T1, T2>::inter_buf_bf16_;

    void invokeAddBiasActivation(const int    m,
                                 const T2*    bias,
                                 const float* input_scale,
                                 const float* input_scale_2,
                                 const float* input_scale_2_min,
                                 const float* output_scale) override;
};

}  // namespace turbomind
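
For orientation, here is how this (now deleted) FP8 FFN API was driven: pick a concrete activation subclass and call forward with named tensors. The snippet below is a hypothetical usage sketch written for this note, not code from the repository; the stream, wrapper, allocator, device pointers (d_input, d_output) and ffn_weights are assumed to be set up elsewhere following turbomind's conventions at the time.

    // Hypothetical usage sketch for the removed FP8 FFN layer (not part of the commit).
    using T1 = __nv_fp8_e4m3;  // FP8 activation storage type
    using T2 = __nv_bfloat16;  // higher-precision bias/intermediate type

    GeluFfnFP8Layer<T1, T2> ffn(inter_size,
                                /* fp8_mode */ 2,
                                stream,
                                cublas_wrapper,  // must actually point to a cublasFP8MMWrapper
                                allocator,
                                /* is_free_buffer_after_forward */ false);

    TensorMap inputs({{"input_hidden_state", Tensor{MEMORY_GPU, TYPE_FP8_E4M3, {token_num, d_model}, d_input}}});
    TensorMap outputs({{"output_hidden_state", Tensor{MEMORY_GPU, TYPE_BF16, {token_num, d_model}, d_output}}});

    ffn.forward(&outputs, &inputs, &ffn_weights);  // GEMM -> bias + GELU -> GEMM

Note that forward() dispatches on the output tensor's type (TYPE_BF16 vs. TYPE_FP8_E4M3), so the output dtype chooses which second-GEMM path runs.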
src/turbomind/layers/FfnFP8Weight.h (deleted, 100644 → 0)
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "FfnWeight.h"
#include "src/turbomind/utils/ScaleList.h"
namespace turbomind {

template<typename T1, typename T2>
struct FfnFP8Weight: FfnWeight<T1, T2> {
    ScaleList* scale_list_ptr;
    float*     identity_scale;
    float*     identity_h_scale;
};

}  // namespace turbomind
src/turbomind/layers/FfnINT8Weight.h (deleted, 100644 → 0)
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "FfnWeight.h"
#include "src/turbomind/utils/ScaleList.h"
namespace turbomind {

template<typename T>
struct FfnINT8Weight: FfnWeight<T> {
    ScaleList* scale_list_ptr;
};

}  // namespace turbomind
src/turbomind/layers/FfnLayerINT8.cc (deleted, 100644 → 0)
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "FfnLayerINT8.h"
#include "src/turbomind/utils/nvtx_utils.h"
namespace turbomind {

template<typename T>
void FfnLayerINT8<T>::forward(std::vector<turbomind::Tensor>*       output_tensors,
                              const std::vector<turbomind::Tensor>* input_tensors,
                              const FfnWeight<T>*                   ffn_weights)
{
    // input_tensors: [input (token_num, hidden_dimension)]
    // output_tensors: [output (token_num, hidden_dimension)]

    ScaleList*           scale_list     = ((const FfnINT8Weight<T>*)ffn_weights)->scale_list_ptr;
    cublasINT8MMWrapper* cublas_wrapper = (cublasINT8MMWrapper*)cublas_wrapper_;

    FT_CHECK(isValidTokenNum(input_tensors->at(0).shape[0]));
    allocateBuffer();

    const int m = static_cast<int>(input_tensors->at(0).shape[0]);
#ifdef SPARSITY_ENABLED
    int m_tmp = m;
    if (m_tmp % 16 != 0) {
        m_tmp = (m_tmp / 16 + 1) * 16;
    }
    const int m_padded = m_tmp;
#endif

    int32_t*      output_tensor = output_tensors->at(0).getPtr<int32_t>();
    const int8_t* input_tensor  = input_tensors->at(0).getPtr<const int8_t>();

    PUSH_RANGE("FFN gemm 1");
    if (int8_mode_ == 1) {
        cublas_wrapper->Gemm(inter_int_buf_,
                             1,
                             m,
                             inter_size_,
                             hidden_units_,
                             0,
                             0,
                             0,
                             input_tensor,
                             (int8_t*)(ffn_weights->intermediate_weight.kernel));
    }
    else if (int8_mode_ == 2 || int8_mode_ == 3) {
#ifdef SPARSITY_ENABLED
        if (sparse_) {
            cublas_wrapper->SpGemm(inter_size_,
                                   m_padded,
                                   hidden_units_,
                                   scale_list->h_scale_list_[scale_list->p3_offset_ + 6],
                                   (int8_t*)(ffn_weights->intermediate_weight.sp_kernel),
                                   input_tensor,
                                   (int8_t*)inter_int_buf_);
        }
        else {
#endif
            cublas_wrapper->Gemm((int8_t*)inter_int_buf_,
                                 1,
                                 m,
                                 inter_size_,
                                 hidden_units_,
                                 0,
                                 0,
                                 0,
                                 scale_list->h_scale_list_[scale_list->p3_offset_ + 6],
                                 input_tensor,
                                 (int8_t*)(ffn_weights->intermediate_weight.kernel));
#ifdef SPARSITY_ENABLED
        }
#endif
    }
    POP_RANGE;

    PUSH_RANGE("add bias act");
    invokeAddBiasActivation(m, ffn_weights->intermediate_weight.bias, scale_list);
    POP_RANGE;
    sync_check_cuda_error();

    PUSH_RANGE("FFN gemm 2");
    if (int8_mode_ == 1) {
        cublas_wrapper->Gemm(output_tensor,
                             1,
                             m,
                             hidden_units_,
                             inter_size_,
                             0,
                             0,
                             0,
                             inter_buf_,
                             (int8_t*)(ffn_weights->output_weight.kernel));
    }
    else if (int8_mode_ == 2 || int8_mode_ == 3) {
#ifdef SPARSITY_ENABLED
        if (sparse_) {
            cublas_wrapper->SpGemm(hidden_units_,
                                   m_padded,
                                   inter_size_,
                                   scale_list->h_scale_list_[scale_list->p3_offset_ + 7],
                                   (int8_t*)(ffn_weights->output_weight.sp_kernel),
                                   inter_buf_,
                                   (int8_t*)output_tensor);
        }
        else {
#endif
            cublas_wrapper->Gemm((int8_t*)output_tensor,
                                 1,
                                 m,
                                 hidden_units_,
                                 inter_size_,
                                 0,
                                 0,
                                 0,
                                 scale_list->h_scale_list_[scale_list->p3_offset_ + 7],
                                 inter_buf_,
                                 (int8_t*)(ffn_weights->output_weight.kernel));
#ifdef SPARSITY_ENABLED
        }
#endif
    }
    POP_RANGE;

    sync_check_cuda_error();
    if (is_free_buffer_after_forward_ == true) {
        freeBuffer();
    }
    sync_check_cuda_error();
}

template<typename T>
FfnLayerINT8<T>::FfnLayerINT8(size_t           max_batch_size,
                              size_t           max_seq_len,
                              size_t           head_num,
                              size_t           size_per_head,
                              size_t           inter_size,
                              int              int8_mode,
                              cudaStream_t     stream,
                              cublasMMWrapper* cublas_wrapper,
                              IAllocator*      allocator,
                              bool             is_free_buffer_after_forward,
                              bool             sparse):
    BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward),
    max_token_num_(max_batch_size * max_seq_len),
    head_num_(head_num),
    size_per_head_(size_per_head),
    hidden_units_(head_num * size_per_head),
    inter_size_(inter_size),
    int8_mode_(int8_mode),
    sparse_(sparse)
{
}

template<typename T>
FfnLayerINT8<T>::FfnLayerINT8(FfnLayerINT8<T> const& ffn_layer):
    BaseLayer(ffn_layer.stream_,
              ffn_layer.cublas_wrapper_,
              ffn_layer.allocator_,
              ffn_layer.is_free_buffer_after_forward_),
    max_token_num_(ffn_layer.max_token_num_),
    head_num_(ffn_layer.head_num_),
    size_per_head_(ffn_layer.size_per_head_),
    hidden_units_(ffn_layer.hidden_units_),
    inter_size_(ffn_layer.inter_size_),
    int8_mode_(ffn_layer.int8_mode_),
    sparse_(ffn_layer.sparse_)
{
}

template<typename T>
FfnLayerINT8<T>::~FfnLayerINT8()
{
    cublas_wrapper_ = nullptr;
    freeBuffer();
}

template<typename T>
void FfnLayerINT8<T>::allocateBuffer()
{
    if (is_allocate_buffer_ == false) {
        inter_int_buf_ =
            (int32_t*)allocator_->reMalloc(inter_int_buf_, sizeof(int32_t) * max_token_num_ * inter_size_, false);
        inter_buf_ = (int8_t*)allocator_->reMalloc(inter_buf_, sizeof(int8_t) * max_token_num_ * inter_size_, false);
        is_allocate_buffer_ = true;
    }
}

template<typename T>
void FfnLayerINT8<T>::freeBuffer()
{
    if (is_allocate_buffer_ == true) {
        allocator_->free((void**)(&inter_int_buf_));
        allocator_->free((void**)(&inter_buf_));
        is_allocate_buffer_ = false;
    }
}

template<typename T>
bool FfnLayerINT8<T>::isValidTokenNum(size_t token_num)
{
    if (max_token_num_ == 0) {
        max_token_num_ = token_num;
        return true;
    }
    else {
        return token_num <= max_token_num_;
    }
}

template class FfnLayerINT8<float>;
template class FfnLayerINT8<half>;

template<typename T>
GeluFfnLayerINT8<T>::GeluFfnLayerINT8(size_t           max_batch_size,
                                      size_t           max_seq_len,
                                      size_t           head_num,
                                      size_t           size_per_head,
                                      size_t           inter_size,
                                      int              int8_mode,
                                      cudaStream_t     stream,
                                      cublasMMWrapper* cublas_wrapper,
                                      IAllocator*      allocator,
                                      bool             is_free_buffer_after_forward,
                                      bool             sparse):
    FfnLayerINT8<T>(max_batch_size,
                    max_seq_len,
                    head_num,
                    size_per_head,
                    inter_size,
                    int8_mode,
                    stream,
                    cublas_wrapper,
                    allocator,
                    is_free_buffer_after_forward,
                    sparse)
{
}

template<typename T>
GeluFfnLayerINT8<T>::GeluFfnLayerINT8(GeluFfnLayerINT8<T> const& gelu_ffn_layer): FfnLayerINT8<T>(gelu_ffn_layer)
{
}

template<typename T>
void GeluFfnLayerINT8<T>::invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list)
{
    if (int8_mode_ == 1) {
        invokeAddBiasGeluCol32<T>(inter_buf_,
                                  inter_int_buf_,
                                  bias,
                                  m,
                                  inter_size_,
                                  stream_,
                                  &(scale_list->d_scale_list_[scale_list->p2_offset_ + 4 * hidden_units_]),
                                  &(scale_list->d_scale_list_[44 + 2]),
                                  &(scale_list->d_scale_list_[52 + 3]));
    }
    else if (int8_mode_ == 2 || int8_mode_ == 3) {
#ifdef SPARSITY_ENABLED
        if (sparse_) {
            invokeAddBiasGeluRow<T>(inter_buf_,
                                    (const int8_t*)inter_int_buf_,
                                    bias,
                                    m,
                                    inter_size_,
                                    stream_,
                                    &(scale_list->d_scale_list_[48 + 1]),
                                    &(scale_list->d_scale_list_[52 + 3]));
        }
        else {
#endif
            invokeAddBiasGeluCol32<T>(inter_buf_,
                                      (const int8_t*)inter_int_buf_,
                                      bias,
                                      m,
                                      inter_size_,
                                      stream_,
                                      &(scale_list->d_scale_list_[48 + 1]),
                                      &(scale_list->d_scale_list_[52 + 3]));
#ifdef SPARSITY_ENABLED
        }
#endif
    }
}

template class GeluFfnLayerINT8<float>;
template class GeluFfnLayerINT8<half>;

template<typename T>
ReluFfnLayerINT8<T>::ReluFfnLayerINT8(size_t           max_batch_size,
                                      size_t           max_seq_len,
                                      size_t           head_num,
                                      size_t           size_per_head,
                                      size_t           inter_size,
                                      int              int8_mode,
                                      cudaStream_t     stream,
                                      cublasMMWrapper* cublas_wrapper,
                                      IAllocator*      allocator,
                                      bool             is_free_buffer_after_forward):
    FfnLayerINT8<T>(max_batch_size,
                    max_seq_len,
                    head_num,
                    size_per_head,
                    inter_size,
                    int8_mode,
                    stream,
                    cublas_wrapper,
                    allocator,
                    is_free_buffer_after_forward)
{
}

template<typename T>
ReluFfnLayerINT8<T>::ReluFfnLayerINT8(ReluFfnLayerINT8<T> const& relu_ffn_layer): FfnLayerINT8<T>(relu_ffn_layer)
{
}

template<typename T>
void ReluFfnLayerINT8<T>::invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list)
{
    // TODO
}

template class ReluFfnLayerINT8<float>;
template class ReluFfnLayerINT8<half>;

}  // namespace turbomind
src/turbomind/layers/FfnLayerINT8.h (deleted, 100644 → 0)
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "FfnINT8Weight.h"
#include "src/turbomind/kernels/activation_int8_kernels.h"
#include "src/turbomind/layers/BaseLayer.h"
#include "src/turbomind/utils/ScaleList.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cublasINT8MMWrapper.h"
#include "src/turbomind/utils/memory_utils.h"
#include <vector>
namespace turbomind {

template<typename T>
class GeluFfnLayerINT8;
template<typename T>
class ReluFfnLayerINT8;

template<typename T>
class FfnLayerINT8: public BaseLayer {
private:
    // buffer handling
    size_t max_token_num_ = 0;

    // meta data
    size_t head_num_;
    size_t size_per_head_;

    // calculated data
    size_t hidden_units_;

    void allocateBuffer() override;
    void freeBuffer() override;
    bool isValidTokenNum(size_t token_num);

protected:
    size_t  inter_size_;
    int     int8_mode_;
    bool    sparse_;
    int*    inter_int_buf_;
    int8_t* inter_buf_;

    virtual void invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list) = 0;

public:
    FfnLayerINT8(size_t           max_batch_size,
                 size_t           max_seq_len,
                 size_t           head_num,
                 size_t           size_per_head,
                 size_t           inter_size,
                 int              int8_mode,
                 cudaStream_t     stream,
                 cublasMMWrapper* cublas_wrapper,
                 IAllocator*      allocator,
                 bool             is_free_buffer_after_forward,
                 bool             sparse = false);

    FfnLayerINT8(FfnLayerINT8<T> const& ffn_layer);

    ~FfnLayerINT8();

    void forward(std::vector<turbomind::Tensor>*       output_tensors,
                 const std::vector<turbomind::Tensor>* input_tensors,
                 const FfnWeight<T>*                   ffn_weights);

    friend GeluFfnLayerINT8<T>;
    friend ReluFfnLayerINT8<T>;
};

template<typename T>
class GeluFfnLayerINT8: public FfnLayerINT8<T> {
public:
    GeluFfnLayerINT8(size_t           max_batch_size,
                     size_t           max_seq_len,
                     size_t           head_num,
                     size_t           size_per_head,
                     size_t           inter_size,
                     int              int8_mode,
                     cudaStream_t     stream,
                     cublasMMWrapper* cublas_wrapper,
                     IAllocator*      allocator,
                     bool             is_free_buffer_after_forward,
                     bool             sparse = false);

    GeluFfnLayerINT8(GeluFfnLayerINT8<T> const& ffn_layer);

    ~GeluFfnLayerINT8() = default;

private:
    using FfnLayerINT8<T>::inter_int_buf_;
    using FfnLayerINT8<T>::inter_buf_;
    using FfnLayerINT8<T>::inter_size_;
    using FfnLayerINT8<T>::stream_;
    using FfnLayerINT8<T>::int8_mode_;
    using FfnLayerINT8<T>::sparse_;
    using FfnLayerINT8<T>::hidden_units_;

    void invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list) override;
};

template<typename T>
class ReluFfnLayerINT8: public FfnLayerINT8<T> {
public:
    ReluFfnLayerINT8(size_t           max_batch_size,
                     size_t           max_seq_len,
                     size_t           head_num,
                     size_t           size_per_head,
                     size_t           inter_size,
                     int              int8_mode,
                     cudaStream_t     stream,
                     cublasMMWrapper* cublas_wrapper,
                     IAllocator*      allocator,
                     bool             is_free_buffer_after_forward);

    ReluFfnLayerINT8(ReluFfnLayerINT8<T> const& ffn_layer);

    ~ReluFfnLayerINT8() = default;

private:
    using FfnLayerINT8<T>::inter_int_buf_;
    using FfnLayerINT8<T>::inter_buf_;
    using FfnLayerINT8<T>::inter_size_;
    using FfnLayerINT8<T>::stream_;
    using FfnLayerINT8<T>::int8_mode_;
    using FfnLayerINT8<T>::hidden_units_;

    void invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list) override;
};

}  // namespace turbomind
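
Unlike the FP8 layer's named TensorMap interface, this INT8 layer takes positional std::vector<Tensor> arguments: input slot 0 carries the int8 activations, output slot 0 receives the result. The following is a hypothetical call sketch written for this note, with device buffers and weights assumed to exist (not code from the repository):

    // Hypothetical usage sketch for the removed INT8 FFN layer (not part of the commit).
    std::vector<turbomind::Tensor> inputs{
        turbomind::Tensor{MEMORY_GPU, TYPE_INT8, {token_num, hidden_units}, d_input_int8}};
    std::vector<turbomind::Tensor> outputs{
        turbomind::Tensor{MEMORY_GPU, TYPE_INT32, {token_num, hidden_units}, d_output_int32}};

    GeluFfnLayerINT8<half> ffn(max_batch_size, max_seq_len, head_num, size_per_head,
                               inter_size, /* int8_mode */ 2, stream,
                               cublas_wrapper,  // must actually point to a cublasINT8MMWrapper
                               allocator, /* is_free_buffer_after_forward */ false);

    // ffn_weights must really be an FfnINT8Weight<half>, since forward() casts it
    // back to that type to recover scale_list_ptr.
    ffn.forward(&outputs, &inputs, (const turbomind::FfnWeight<half>*)&ffn_weights);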
src/turbomind/layers/attention_layers_fp8/AttentionFP8Weight.h (deleted, 100644 → 0)
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/turbomind/layers/attention_layers/AttentionWeight.h"
#include "src/turbomind/utils/ScaleList.h"
namespace turbomind {

template<typename T1, typename T2>
struct AttentionFP8Weight: public AttentionWeight<T1, T2> {
    const float* qk_scale;
    const float* qk_scale_inv;
    float*       qk_h_scale;
    float*       qk_h_scale_inv;
    float*       identity_scale;
    float*       identity_h_scale;
};

}  // namespace turbomind
src/turbomind/layers/attention_layers_fp8/BaseAttentionFP8Layer.h (deleted, 100644 → 0)
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <assert.h>
#include <vector>
#include "src/turbomind/layers/BaseLayer.h"
#include "src/turbomind/layers/attention_layers/BaseAttentionLayer.h"
#include "src/turbomind/layers/attention_layers_fp8/AttentionFP8Weight.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cublasFP8MMWrapper.h"
#include "src/turbomind/utils/memory_utils.h"
namespace turbomind {
// template<typename T>
// AttentionType getAttentionType(size_t size_per_head, const int sm, const bool remove_padding, const int max_seq_len,
// const bool is_fuse = true)
// {
// if (std::is_same<T, half>::value && (sm == kSM_70 || sm == kSM_86 || sm == kSM_80 || sm == kSM_75 || sm ==
// kSM_72)
// && size_per_head == 64 && max_seq_len <= 384 && is_fuse == true) {
// return remove_padding ? AttentionType::FUSED_MHA : AttentionType::FUSED_PADDED_MHA;
// }
// else {
// return remove_padding ? AttentionType::UNFUSED_MHA : AttentionType::UNFUSED_PADDED_MHA;
// }
// }
template<typename T1, typename T2>
class BaseAttentionFP8Layer: public BaseLayer {
public:
    virtual void
    forward(TensorMap* output_tensors, TensorMap* input_tensors, const AttentionFP8Weight<T1, T2>* attention_weights) = 0;

    BaseAttentionFP8Layer(cudaStream_t     stream,
                          cublasMMWrapper* cublas_wrapper,
                          IAllocator*      allocator,
                          bool             is_free_buffer_after_forward,
                          bool             sparse = false):
        BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, nullptr, sparse)
    {
    }

    virtual ~BaseAttentionFP8Layer() = default;
};

}  // namespace turbomind
src/turbomind/layers/attention_layers_fp8/CMakeLists.txt (deleted, 100644 → 0)
# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
cmake_minimum_required(VERSION 3.8)
src/turbomind/layers/attention_layers_int8/AttentionINT8Weight.h (deleted, 100644 → 0)
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/turbomind/layers/attention_layers/AttentionWeight.h"
#include "src/turbomind/utils/ScaleList.h"
namespace turbomind {

template<typename T>
struct AttentionINT8Weight: AttentionWeight<T> {
    ScaleList* scale_list_ptr;
};

}  // namespace turbomind
src/turbomind/layers/attention_layers_int8/CMakeLists.txt (deleted, 100644 → 0)
# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
cmake_minimum_required(VERSION 3.8)
src/turbomind/layers/beam_search_layers/BaseBeamSearchLayer.cu (deleted, 100644 → 0)
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/turbomind/kernels/beam_search_penalty_kernels.h"
#include "src/turbomind/layers/beam_search_layers/BaseBeamSearchLayer.h"
#include "src/turbomind/utils/cuda_utils.h"
namespace turbomind {

__global__ void update_indir_cache_kernel(int*        tgt_indir_cache,
                                          const int*  src_indir_cache,
                                          const int*  beam_ids,
                                          const bool* finished,
                                          int         start_step,
                                          int         batch_dim,
                                          int         local_batch_size,
                                          int         beam_width,
                                          int         max_seq_len,
                                          int         step)
{
    int       time_step = threadIdx.x + blockIdx.x * blockDim.x;
    int       bb_id     = threadIdx.y + blockIdx.y * blockDim.y;
    const int batch_id  = bb_id / beam_width;
    const int beam_id   = bb_id % beam_width;

    if (bb_id >= beam_width * local_batch_size || time_step >= min(step + 1, max_seq_len) || finished[bb_id]) {
        return;
    }
    time_step += start_step;
    const int time_step_circ = time_step % max_seq_len;

    const int src_beam = beam_ids[batch_id * beam_width + beam_id];

    const uint tgt_offset = batch_id * beam_width * max_seq_len + beam_id * max_seq_len + time_step_circ;
    const uint src_offset = batch_id * beam_width * max_seq_len + src_beam * max_seq_len + time_step_circ;

    tgt_indir_cache[tgt_offset] = (time_step == step) ? beam_id : src_indir_cache[src_offset];
}

void update_indir_cache_kernelLauncher(int*         tgt_indir_cache,
                                       const int*   src_indir_cache,
                                       const int*   beam_ids,
                                       const bool*  finished,
                                       int          batch_dim,
                                       int          local_batch_size,
                                       int          beam_width,
                                       int          max_seq_len,
                                       int          step,
                                       cudaStream_t stream)
{
    const dim3 block(32);
    const int  start_step = max(0, step + 1 - max_seq_len);
    const int  num_steps  = min(step + 1, max_seq_len);
    // Update indirections steps [start_step, step], included
    const dim3 grid((num_steps + block.x - 1) / block.x, local_batch_size * beam_width);
    update_indir_cache_kernel<<<grid, block, 0, stream>>>(tgt_indir_cache,
                                                          src_indir_cache,
                                                          beam_ids,
                                                          finished,
                                                          start_step,
                                                          batch_dim,
                                                          local_batch_size,
                                                          beam_width,
                                                          max_seq_len,
                                                          step);
}
template<typename T>
BaseBeamSearchLayer<T>::BaseBeamSearchLayer(size_t           max_batch_size,
                                            size_t           head_num,
                                            size_t           size_per_head,
                                            size_t           beam_width,
                                            size_t           vocab_size,
                                            size_t           vocab_size_padded,
                                            int              end_id,
                                            float            diversity_rate,
                                            float            temperature,
                                            float            len_penalty,
                                            float            repetition_penalty,
                                            cudaStream_t     stream,
                                            cublasMMWrapper* cublas_wrapper,
                                            IAllocator*      allocator,
                                            bool             is_free_buffer_after_forward):
    DynamicDecodeBaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward, nullptr),
    vocab_size_(vocab_size),
    vocab_size_padded_(vocab_size_padded)
{
}

template<typename T>
BaseBeamSearchLayer<T>::BaseBeamSearchLayer(BaseBeamSearchLayer<T> const& beam_search_layer):
    DynamicDecodeBaseLayer(beam_search_layer),
    vocab_size_(beam_search_layer.vocab_size_),
    vocab_size_padded_(beam_search_layer.vocab_size_padded_),
    topk_softmax_workspace_size_(beam_search_layer.topk_softmax_workspace_size_)
{
}

template<typename T>
BaseBeamSearchLayer<T>::~BaseBeamSearchLayer()
{
    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    freeBuffer();
}

template<typename T>
void BaseBeamSearchLayer<T>::freeBuffer()
{
    if (is_allocate_buffer_) {
        allocator_->free((void**)(&topk_softmax_workspace_));
        is_allocate_buffer_ = false;
    }
}

template<typename T>
void BaseBeamSearchLayer<T>::setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args)
{
    // do nothing.
}

template<typename T>
void BaseBeamSearchLayer<T>::forward(std::vector<Tensor>* output_tensors, const std::vector<Tensor>* input_tensors)
{
    // input_tensors:
    //      logits [local_batch_size, beam_width, vocab_size_padded]
    //      embedding_bias [vocab_size_padded]
    //      step [1] on cpu
    //      src_cache_indirection [local_batch_size, beam_width, max_seq_len]
    //      max_input_length [1] on cpu
    //      input_lengths [local_batch_size * beam_width]
    //      ite [1] on cpu

    // output_tensors:
    //      output_ids [max_seq_len, batch_size, beam_width]
    //      finished [local_batch_size * beam_width]
    //      cum_log_probs [local_batch_size * beam_width]
    //      parent_ids [max_seq_len, batch_size * beam_width]
    //      sequence_length [local_batch_size * beam_width]
    //      tgt_cache_indirection [local_batch_size, beam_width, max_seq_len]

    std::unordered_map<std::string, Tensor> input_tensors_map{{"logits", input_tensors->at(0)},
                                                              {"embedding_bias", input_tensors->at(1)},
                                                              {"step", input_tensors->at(2)},
                                                              {"src_cache_indirection", input_tensors->at(4)},
                                                              {"max_input_length", input_tensors->at(5)},
                                                              {"input_lengths", input_tensors->at(6)},
                                                              {"ite", input_tensors->at(7)}};

    std::unordered_map<std::string, Tensor> output_tensors_map{{"output_ids", output_tensors->at(0)},
                                                               {"finished", output_tensors->at(1)},
                                                               {"cum_log_probs", output_tensors->at(2)},
                                                               {"parent_ids", output_tensors->at(3)},
                                                               {"sequence_length", output_tensors->at(4)},
                                                               {"tgt_cache_indirection", output_tensors->at(5)}};
    forward(&output_tensors_map, &input_tensors_map);
}

template<typename T>
void BaseBeamSearchLayer<T>::forward(std::unordered_map<std::string, Tensor>*       output_tensors,
                                     const std::unordered_map<std::string, Tensor>* input_tensors)
{
    TensorMap input_map(*input_tensors);
    TensorMap output_map(*output_tensors);
    forward(&output_map, &input_map);
}

template<typename T>
void BaseBeamSearchLayer<T>::forward(TensorMap* output_tensors, TensorMap* input_tensors)
{
    // input_tensors:
    //      logits [local_batch_size, beam_width, vocab_size_padded]
    //      embedding_bias [vocab_size_padded]
    //      step [1] on cpu
    //      src_cache_indirection [local_batch_size, beam_width, max_seq_len]
    //      end_id [local_batch_size]
    //      max_input_length [1] on cpu
    //      input_lengths [local_batch_size * beam_width], optional
    //      ite [1] on cpu
    //      beam_search_diversity_rate [1] on cpu, optional
    //      temperature [1] on cpu, optional
    //      len_penalty [1] on cpu, optional
    //      repetition_penalty [1] on cpu, optional
    //      presence_penalty [1] on cpu, optional
    //          Only one of repetition and presence penalties is allowed.
    //      min_length [1] on cpu, int, optional

    // output_tensors:
    //      output_ids [max_seq_len, batch_size, beam_width]
    //      finished [local_batch_size * beam_width], optional
    //      cum_log_probs [local_batch_size * beam_width]
    //      parent_ids [max_seq_len, batch_size * beam_width]
    //      sequence_length [local_batch_size * beam_width], optional
    //      tgt_cache_indirection [local_batch_size, beam_width, max_seq_len]
    //      output_log_probs [max_seq_len, batch_size, beam_width], optional
    //      beam_hyps, optional

    FT_CHECK(input_tensors->size() >= 7);
    FT_CHECK(output_tensors->size() >= 5);

    const int batch_size = output_tensors->at("output_ids").shape[1];
    const int beam_width = output_tensors->at("output_ids").shape[2];
    allocateBuffer(batch_size, beam_width);

    const int step             = input_tensors->at("step").getVal<int>();
    const int ite              = input_tensors->at("ite").getVal<int>();
    const int local_batch_size = input_tensors->at("logits").shape[0];

    const float temperature    = input_tensors->getVal<float>("temperature", 1.0f);
    const T*    embedding_bias = input_tensors->getPtr<const T>("embedding_bias", nullptr);

    RepetitionPenaltyType repetition_penalty_type = RepetitionPenaltyType::None;
    float                 repetition_penalty      = getDefaultPenaltyValue(repetition_penalty_type);
    if (input_tensors->isExist("repetition_penalty") || input_tensors->isExist("presence_penalty")) {
        FT_CHECK_WITH_INFO(
            !(input_tensors->isExist("repetition_penalty") && input_tensors->isExist("presence_penalty")),
            "Found ambiguous parameters repetition_penalty and presence_penalty which are mutually exclusive. "
            "Please provide one of repetition_penalty or presence_penalty.");
        repetition_penalty_type = input_tensors->isExist("repetition_penalty") ? RepetitionPenaltyType::Multiplicative :
                                                                                 RepetitionPenaltyType::Additive;
        repetition_penalty      = repetition_penalty_type == RepetitionPenaltyType::Multiplicative ?
                                      input_tensors->getVal<float>("repetition_penalty") :
                                      input_tensors->getVal<float>("presence_penalty");
    }

    invokeAddBiasApplyPenalties(
        step,
        input_tensors->at("logits").getPtr<T>(),
        output_tensors->at("output_ids")
            .getPtrWithOffset<const int>((step - 1) * batch_size * beam_width + ite * local_batch_size * beam_width),
        output_tensors->getPtr<const int>("output_ids"),
        output_tensors->getPtr<const int>("parent_ids"),
        input_tensors->getPtr<const int>("input_lengths", nullptr),
        output_tensors->getPtr<const int>("sequence_length", nullptr),
        embedding_bias,
        ite,
        input_tensors->getVal<int>("max_input_length"),
        local_batch_size,
        batch_size,
        beam_width,
        vocab_size_,
        vocab_size_padded_,
        input_tensors->getPtr<const int>("end_id", nullptr),
        temperature,
        repetition_penalty,
        repetition_penalty_type,
        input_tensors->getVal<const int>("min_length", 0),
        stream_);
    sync_check_cuda_error();

    invokeSoftMax(output_tensors, input_tensors);

    if (beam_width > 1) {
        const int max_seq_len = output_tensors->at("output_ids").shape[0];
        update_indir_cache_kernelLauncher(
            output_tensors->at("tgt_cache_indirection").getPtr<int>(),
            input_tensors->at("src_cache_indirection").getPtr<const int>(),
            output_tensors->at("parent_ids")
                .getPtrWithOffset<const int>(+step * beam_width * batch_size + ite * local_batch_size * beam_width),
            output_tensors->at("finished").getPtr<const bool>(),
            batch_size,
            local_batch_size,
            beam_width,
            max_seq_len,
            step,
            stream_);
        sync_check_cuda_error();
    }
    sync_check_cuda_error();
    if (is_free_buffer_after_forward_) {
        freeBuffer();
    }
    sync_check_cuda_error();
}

template class BaseBeamSearchLayer<float>;
template class BaseBeamSearchLayer<half>;

}  // namespace turbomind
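
The indirection-cache update above is easier to follow in scalar form. The host-side loop below is an illustrative sketch written for this note (it is not part of the commit); it assumes plain row-major [batch, beam, max_seq_len] layouts, matching the offsets used in update_indir_cache_kernel.

    // Host-side reference for update_indir_cache_kernel (illustrative sketch only).
    void update_indir_cache_reference(int* tgt, const int* src, const int* beam_ids,
                                      const bool* finished, int start_step,
                                      int local_batch_size, int beam_width,
                                      int max_seq_len, int step)
    {
        for (int b = 0; b < local_batch_size; ++b) {
            for (int beam = 0; beam < beam_width; ++beam) {
                const int bb = b * beam_width + beam;
                if (finished[bb]) {
                    continue;
                }
                const int src_beam = beam_ids[bb];  // parent beam selected at this step
                for (int t = start_step; t <= step; ++t) {
                    const int tc  = t % max_seq_len;  // circular cache slot
                    const int row = b * beam_width * max_seq_len;
                    tgt[row + beam * max_seq_len + tc] =
                        (t == step) ? beam : src[row + src_beam * max_seq_len + tc];
                }
            }
        }
    }

Rewriting old steps through src_beam is what lets beam search reuse a single KV cache across re-ranked beams; the modulo on the time step handles the circular cache once step exceeds max_seq_len.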
src/turbomind/layers/beam_search_layers/BaseBeamSearchLayer.h (deleted, 100644 → 0)
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/turbomind/kernels/penalty_types.h"
#include "src/turbomind/layers/DynamicDecodeBaseLayer.h"
namespace turbomind {

template<typename T>
class BaseBeamSearchLayer: public DynamicDecodeBaseLayer {
private:
    void freeBuffer();

protected:
    // meta data
    size_t vocab_size_;
    size_t vocab_size_padded_;

    size_t topk_softmax_workspace_size_;
    void*  topk_softmax_workspace_ = nullptr;

    virtual void allocateBuffer() = 0;
    virtual void allocateBuffer(size_t batch_size, size_t beam_width) = 0;
    virtual void invokeSoftMax(TensorMap* output_tensors, TensorMap* input_tensors) = 0;

public:
    BaseBeamSearchLayer(size_t           max_batch_size,
                        size_t           head_num,
                        size_t           size_per_head,
                        size_t           beam_width,
                        size_t           vocab_size,
                        size_t           vocab_size_padded,
                        int              end_id,
                        float            diversity_rate,
                        float            temperature,
                        float            len_penalty,
                        float            repetition_penalty,
                        cudaStream_t     stream,
                        cublasMMWrapper* cublas_wrapper,
                        IAllocator*      allocator,
                        bool             is_free_buffer_after_forward);

    BaseBeamSearchLayer(BaseBeamSearchLayer<T> const& beam_search_layer);

    ~BaseBeamSearchLayer();

    void setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args) override;

    void forward(std::vector<turbomind::Tensor>*       output_tensors,
                 const std::vector<turbomind::Tensor>* input_tensors) override;

    void forward(std::unordered_map<std::string, Tensor>*       output_tensors,
                 const std::unordered_map<std::string, Tensor>* input_tensors) override;

    void forward(TensorMap* output_tensors, TensorMap* input_tensors) override;
};

void update_indir_cache_kernelLauncher(int*         tgt_indir_cache,
                                       const int*   src_indir_cache,
                                       const int*   beam_ids,
                                       const bool*  finished,
                                       int          batch_dim,
                                       int          beam_width,
                                       int          max_seq_len,
                                       int          ite,
                                       cudaStream_t stream);

}  // namespace turbomind
src/turbomind/layers/beam_search_layers/BeamSearchLayer.cu (deleted, 100644 → 0)
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
#include "src/turbomind/layers/beam_search_layers/BeamSearchLayer.h"
namespace turbomind {

template<typename T>
__global__ void logProbAddCumLogProb(float*       log_probs,
                                     const T*     logits,
                                     const float* cum_log_probs,
                                     const int*   end_ids,
                                     const bool*  finished,
                                     const int    beam_width,
                                     const int    n)
{
    int  bid    = blockIdx.x;
    bool finish = finished != nullptr ? finished[bid] : false;
    int  offset = bid * n;

    float            max_val = -1 * FLT_MAX;
    __shared__ float s_max_val;
    __shared__ float s_sum_val;

    if (finish) {
        for (int tid = threadIdx.x; tid < n; tid += blockDim.x) {
            log_probs[offset + tid] = (tid == end_ids[bid / beam_width]) ? cum_log_probs[bid] : -FLT_MAX;
        }
    }
    else {
        for (int tid = threadIdx.x; tid < n; tid += blockDim.x) {
            log_probs[offset + tid] = (float)(logits[offset + tid]);
            max_val                 = max(max_val, log_probs[offset + tid]);
        }

        max_val = blockReduceMax(max_val);
        if (threadIdx.x == 0) {
            s_max_val = max_val;
        }
        __syncthreads();

        float sum_val = 0.0f;
        for (int tid = threadIdx.x; tid < n; tid += blockDim.x) {
            log_probs[offset + tid] = __expf(log_probs[offset + tid] - s_max_val);
            sum_val += log_probs[offset + tid];
        }

        sum_val = blockReduceSum(sum_val);
        if (threadIdx.x == 0) {
            s_sum_val = sum_val + 1e-6f;
        }
        __syncthreads();

        for (int tid = threadIdx.x; tid < n; tid += blockDim.x) {
            log_probs[offset + tid] = logf(log_probs[offset + tid] / s_sum_val) + cum_log_probs[bid];
        }
    }
}

template<typename T>
void invokeLogProbAddCumLogProb(float*       log_probs,
                                const T*     logits,
                                const float* cum_log_probs,
                                const int*   end_ids,
                                const bool*  finished,
                                const int    m,
                                const int    beam_width,
                                const int    n,
                                cudaStream_t stream)
{
    dim3 grid(m);
    dim3 block(min(n, 1024));
    /* n is the vocab_size, e.g., 30000, 7000...; vocab_size is usually very big. */
    logProbAddCumLogProb<<<grid, block, 0, stream>>>(
        log_probs, logits, cum_log_probs, end_ids, finished, beam_width, n);
}
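Stripped of the CUDA plumbing, the kernel above is a numerically stable log-softmax per beam, with the beam's running score added back in. A one-row CPU sketch of the same math (the helper name and single-row scope are ours, not part of the deleted file):

#include <algorithm>
#include <cfloat>
#include <cmath>

// CPU reference of the non-finished branch: log-softmax of one row of
// logits, shifted by that beam's cumulative log-probability.
void logProbAddCumLogProbRef(float* log_probs, const float* logits, float cum_log_prob, int n)
{
    float max_val = -FLT_MAX;
    for (int i = 0; i < n; ++i)
        max_val = std::max(max_val, logits[i]);

    float sum_val = 0.0f;
    for (int i = 0; i < n; ++i)
        sum_val += std::exp(logits[i] - max_val);
    sum_val += 1e-6f;  // same epsilon the kernel adds before the log

    for (int i = 0; i < n; ++i)
        log_probs[i] = std::log(std::exp(logits[i] - max_val) / sum_val) + cum_log_prob;
}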
template<typename T>
__global__ void updateStatesKernel(T*             log_probs,
                                   T*             cum_log_probs,
                                   float*         output_log_probs,
                                   bool*          finished,
                                   int*           parent_ids,
                                   int*           sequence_length,
                                   int*           word_ids,
                                   int*           output_ids,
                                   BeamHypotheses beam_hyps,
                                   const int      local_batch_size,
                                   const int      beam_width,
                                   const int      vocab_size,
                                   const int*     end_ids)
{
    for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < local_batch_size * beam_width;
         index += blockDim.x * gridDim.x) {
        int batch_id = index / beam_width;

        sequence_length[index] = finished[index] ? sequence_length[index] : sequence_length[index] + 1;

        int beam_id = (word_ids[index] / vocab_size) % beam_width;
        int word_id = word_ids[index] % vocab_size;

        if (output_log_probs != nullptr) {
            // get the cum_log_probs of previous run
            output_log_probs[index] = log_probs[batch_id * beam_width * vocab_size + beam_id * vocab_size + word_id]
                                      - cum_log_probs[batch_id * beam_width + beam_id];
        }
        cum_log_probs[index]   = log_probs[batch_id * beam_width * vocab_size + beam_id * vocab_size + word_id];
        sequence_length[index] = sequence_length[batch_id * beam_width + beam_id];
        finished[index]        = word_id == end_ids[batch_id] ? 1 : 0;
        parent_ids[index]      = beam_id;
        word_ids[index]        = word_id;
        output_ids[index]      = word_id;

        if (beam_hyps.num_beams != nullptr) {
            if (beam_hyps.num_beams[beam_hyps.ite * beam_hyps.local_batch_size + batch_id] == beam_width) {
                for (int i = 0; i < beam_width; i++) {
                    finished[batch_id * beam_width + i] = true;
                }
            }
        }
    }
}

void invokeUpdateStates(float*          log_probs,
                        float*          cum_log_probs,
                        float*          output_log_probs,
                        bool*           finished,
                        int*            parent_ids,
                        int*            sequence_length,
                        int*            word_ids,
                        int*            output_ids,
                        BeamHypotheses* beam_hyps,
                        const int       local_batch_size,
                        const int       beam_width,
                        const int       vocab_size,
                        const int*      end_ids,
                        cudaStream_t    stream)
{
    dim3 grid((int)ceil(local_batch_size * beam_width * 1.0 / 256));
    dim3 block(256);

    updateStatesKernel<float><<<grid, block, 0, stream>>>(log_probs,
                                                          cum_log_probs,
                                                          output_log_probs,
                                                          finished,
                                                          parent_ids,
                                                          sequence_length,
                                                          word_ids,
                                                          output_ids,
                                                          *beam_hyps,
                                                          local_batch_size,
                                                          beam_width,
                                                          vocab_size,
                                                          end_ids);
}
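The beam_id / word_id decomposition above assumes the preceding top-k pass packs each selected candidate as a flat index into the per-batch [beam_width, vocab_size] score matrix. A worked instance with illustrative numbers:

// Illustrative decomposition: with beam_width = 4 and vocab_size = 32000,
// a packed candidate id of 2 * 32000 + 17 = 64017 yields
//   beam_id = (64017 / 32000) % 4 = 2   // parent beam being extended
//   word_id =  64017 % 32000     = 17   // token appended to that beam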
template<typename T>
void BeamSearchLayer<T>::invokeSoftMax(TensorMap* output_tensors, TensorMap* input_tensors)
{
    // input_tensors:
    //      logits [local_batch_size, beam_width, vocab_size_padded]
    //      embedding_bias [vocab_size_padded]
    //      step [1] on cpu
    //      src_cache_indirection [local_batch_size, beam_width, max_seq_len]
    //      max_input_length [1] on cpu
    //      input_lengths [local_batch_size * beam_width]
    //      ite [1] on cpu
    //      beam_search_diversity_rate [1] on cpu, optional
    //      temperature [1] on cpu, optional
    //      len_penalty [1] on cpu, optional
    //      repetition_penalty [1] on cpu, optional

    // output_tensors:
    //      output_ids [max_seq_len, batch_size, beam_width]
    //      finished [local_batch_size * beam_width]
    //      cum_log_probs [local_batch_size * beam_width]
    //      parent_ids [max_seq_len, batch_size * beam_width]
    //      sequence_length [local_batch_size * beam_width]
    //      tgt_cache_indirection [local_batch_size, beam_width, max_seq_len]
    //      output_log_probs [max_seq_len, batch_size * beam_width], optional
    //      beam_hyps, optional

    FT_CHECK(input_tensors->size() >= 7);
    FT_CHECK(output_tensors->size() >= 6);

    const int batch_size       = output_tensors->at("output_ids").shape[1];
    const int beam_width       = output_tensors->at("output_ids").shape[2];
    const int step             = input_tensors->at("step").getVal<int>();
    const int ite              = input_tensors->at("ite").getVal<int>();
    const int local_batch_size = input_tensors->at("logits").shape[0];

    const float diversity_rate = input_tensors->isExist("beam_search_diversity_rate") ?
                                     input_tensors->at("beam_search_diversity_rate").getVal<float>() :
                                     0.0f;
    const float length_penalty =
        input_tensors->isExist("len_penalty") ? input_tensors->at("len_penalty").getVal<float>() : 0.0f;

    const int id_offset = step * batch_size * beam_width + ite * local_batch_size * beam_width;
    invokeLogProbAddCumLogProb(float_log_prob_buf_,
                               input_tensors->at("logits").getPtr<T>(),
                               output_tensors->at("cum_log_probs").getPtr<float>(),
                               input_tensors->at("end_id").getPtr<const int>(),
                               output_tensors->at("finished").getPtr<bool>(),
                               local_batch_size * beam_width,
                               beam_width,
                               vocab_size_padded_,
                               stream_);
    sync_check_cuda_error();

    BeamHypotheses beam_hyps;
    if (output_tensors->isExist("beam_hyps") && diversity_rate == 0.0f) {
        beam_hyps                      = *((BeamHypotheses*)(output_tensors->at("beam_hyps").getPtr<void>()));
        beam_hyps.step                 = step;
        beam_hyps.ite                  = ite;
        beam_hyps.local_batch_size     = local_batch_size;
        beam_hyps.batch_size           = output_tensors->at("output_ids").shape[1];
        beam_hyps.max_seq_len          = output_tensors->at("output_ids").shape[0];
        beam_hyps.output_ids_src       = output_tensors->at("output_ids").getPtr<int>();
        beam_hyps.parent_ids_src       = output_tensors->at("parent_ids").getPtr<int>();
        beam_hyps.sequence_lengths_src = output_tensors->at("sequence_length").getPtr<int>();
        beam_hyps.length_penalty       = length_penalty;
    }

    invokeTopkBeamSearch<float>(topk_softmax_workspace_,
                                topk_softmax_workspace_size_,
                                float_log_prob_buf_,
                                output_tensors->at("output_ids").getPtrWithOffset<int>(id_offset),
                                &beam_hyps,
                                output_tensors->at("finished").getPtr<bool>(),
                                output_tensors->isExist("sequence_length") ?
                                    output_tensors->at("sequence_length").getPtr<int>() :
                                    (int*)nullptr,
                                local_batch_size,
                                beam_width,
                                vocab_size_padded_,
                                diversity_rate,
                                length_penalty,
                                input_tensors->at("end_id").getPtr<const int>(),
                                stream_);
    sync_check_cuda_error();

    invokeUpdateStates(float_log_prob_buf_,
                       output_tensors->at("cum_log_probs").getPtr<float>(),
                       output_tensors->getPtrWithOffset<float>("output_log_probs", id_offset, nullptr),
                       output_tensors->at("finished").getPtr<bool>(),
                       output_tensors->at("parent_ids").getPtrWithOffset<int>(id_offset),
                       output_tensors->at("sequence_length").getPtr<int>(),
                       output_tensors->at("output_ids").getPtrWithOffset<int>(id_offset),
                       output_tensors->at("output_ids").getPtrWithOffset<int>(id_offset),
                       &beam_hyps,
                       local_batch_size,
                       beam_width,
                       vocab_size_padded_,
                       input_tensors->at("end_id").getPtr<const int>(),
                       stream_);
    sync_check_cuda_error();
}
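The id_offset computed above selects where this step's ids land in the step-major output buffers. A worked instance, assuming batch_size = local_batch_size = 2, beam_width = 4, step = 3, and ite = 0 (illustrative values):

// id_offset = step * batch_size * beam_width + ite * local_batch_size * beam_width
//           = 3 * 2 * 4 + 0 * 2 * 4 = 24
// so writes for step 3 begin 24 ints into output_ids, whose layout is
// [max_seq_len, batch_size, beam_width].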
template<typename T>
void BeamSearchLayer<T>::allocateBuffer()
{
    FT_CHECK(false);
}

template<typename T>
void BeamSearchLayer<T>::allocateBuffer(size_t batch_size, size_t beam_width)
{
    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    invokeTopkBeamSearch<float>(nullptr,
                                topk_softmax_workspace_size_,
                                nullptr,
                                nullptr,
                                nullptr,
                                nullptr,
                                nullptr,
                                batch_size,
                                beam_width,
                                vocab_size_padded_,
                                0.0f,  // diversity rate
                                0.0f,  // length penalty
                                nullptr,
                                stream_);
    topk_softmax_workspace_ = reinterpret_cast<float*>(allocator_->reMalloc(
        topk_softmax_workspace_,
        topk_softmax_workspace_size_ + sizeof(float) * batch_size * beam_width * vocab_size_padded_,
        false));
    float_log_prob_buf_ = (float*)((char*)topk_softmax_workspace_ + topk_softmax_workspace_size_);
    is_allocate_buffer_ = true;
}
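Note that a single reMalloc backs two regions here: the first topk_softmax_workspace_size_ bytes are the top-k scratch, and float_log_prob_buf_ starts immediately after it. A minimal sketch of the same carve-up, with ws_bytes and n as hypothetical stand-ins for the two sizes above:

#include <cstdlib>

// Single-allocation carve-up: scratch bytes first, float buffer right after.
// ws_bytes / n stand in for topk_softmax_workspace_size_ and
// batch_size * beam_width * vocab_size_padded_.
void workspaceLayoutSketch(size_t ws_bytes, size_t n)
{
    char*  base      = static_cast<char*>(std::malloc(ws_bytes + sizeof(float) * n));
    void*  scratch   = base;                                       // top-k workspace
    float* log_probs = reinterpret_cast<float*>(base + ws_bytes);  // log-prob buffer
    (void)scratch;
    (void)log_probs;
    std::free(base);
}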
template<typename T>
BeamSearchLayer<T>::BeamSearchLayer(size_t           max_batch_size,
                                    size_t           head_num,
                                    size_t           size_per_head,
                                    size_t           beam_width,
                                    size_t           vocab_size,
                                    size_t           vocab_size_padded,
                                    int              end_id,
                                    float            diversity_rate,
                                    float            temperature,
                                    float            len_penalty,
                                    float            repetition_penalty,
                                    cudaStream_t     stream,
                                    cublasMMWrapper* cublas_wrapper,
                                    IAllocator*      allocator,
                                    bool             is_free_buffer_after_forward):
    BaseBeamSearchLayer<T>(max_batch_size,
                           head_num,
                           size_per_head,
                           beam_width,
                           vocab_size,
                           vocab_size_padded,
                           end_id,
                           diversity_rate,
                           temperature,
                           len_penalty,
                           repetition_penalty,
                           stream,
                           cublas_wrapper,
                           allocator,
                           is_free_buffer_after_forward)
{
}

template<typename T>
BeamSearchLayer<T>::BeamSearchLayer(BeamSearchLayer<T> const& beam_search_layer):
    BaseBeamSearchLayer<T>(beam_search_layer)
{
}

template<typename T>
BeamSearchLayer<T>::~BeamSearchLayer()
{
    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
}

template class BeamSearchLayer<float>;
template class BeamSearchLayer<half>;

}  // namespace turbomind
src/turbomind/layers/beam_search_layers/BeamSearchLayer.h
deleted
100644 → 0
View file @
83697422
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/turbomind/kernels/beam_search_topk_kernels.h"
#include "src/turbomind/layers/beam_search_layers/BaseBeamSearchLayer.h"
#include <float.h>
namespace turbomind {

template<typename T>
class BeamSearchLayer: public BaseBeamSearchLayer<T> {
private:
    // meta data
    using BaseBeamSearchLayer<T>::vocab_size_;
    using BaseBeamSearchLayer<T>::vocab_size_padded_;

    using BaseBeamSearchLayer<T>::topk_softmax_workspace_size_;
    using BaseBeamSearchLayer<T>::topk_softmax_workspace_;

    void allocateBuffer() override;
    void allocateBuffer(size_t batch_size, size_t beam_width) override;

    void invokeSoftMax(TensorMap* output_tensors, TensorMap* input_tensors) override;

    using BaseBeamSearchLayer<T>::stream_;
    using BaseBeamSearchLayer<T>::is_allocate_buffer_;
    using BaseBeamSearchLayer<T>::allocator_;

    float* float_log_prob_buf_ = nullptr;

protected:
public:
    BeamSearchLayer(size_t           max_batch_size,
                    size_t           head_num,
                    size_t           size_per_head,
                    size_t           beam_width,
                    size_t           vocab_size,
                    size_t           vocab_size_padded,
                    int              end_id,
                    float            diversity_rate,
                    float            temperature,
                    float            len_penalty,
                    float            repetition_penalty,
                    cudaStream_t     stream,
                    cublasMMWrapper* cublas_wrapper,
                    IAllocator*      allocator,
                    bool             is_free_buffer_after_forward);

    BeamSearchLayer(BeamSearchLayer<T> const& beam_search_layer);

    ~BeamSearchLayer();
};

}  // namespace turbomind
src/turbomind/layers/beam_search_layers/CMakeLists.txt
deleted
100644 → 0
View file @
83697422
# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
cmake_minimum_required(VERSION 3.8)

add_library(BaseBeamSearchLayer STATIC BaseBeamSearchLayer.cu)
set_property(TARGET BaseBeamSearchLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET BaseBeamSearchLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(BaseBeamSearchLayer PUBLIC -lcudart beam_search_penalty_kernels cuda_utils)

add_library(OnlineBeamSearchLayer STATIC OnlineBeamSearchLayer.cu)
set_property(TARGET OnlineBeamSearchLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET OnlineBeamSearchLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(OnlineBeamSearchLayer PUBLIC -lcudart BaseBeamSearchLayer online_softmax_beamsearch_kernels)

add_library(BeamSearchLayer STATIC BeamSearchLayer.cu)
set_property(TARGET BeamSearchLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET BeamSearchLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(BeamSearchLayer PUBLIC -lcudart BaseBeamSearchLayer beam_search_topk_kernels)
src/turbomind/layers/beam_search_layers/OnlineBeamSearchLayer.cu
deleted
100644 → 0
View file @
83697422
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/turbomind/layers/beam_search_layers/OnlineBeamSearchLayer.h"
namespace turbomind {

static const int SMALL_TOP_K_SOFTMAX_MAX_VOC_PARTS = 128;
static const int MAX_K                             = 4;

template<typename T>
__global__ void update_kernel(bool*          finished,
                              int*           parent_ids,
                              int*           sequence_length,
                              int*           word_ids,
                              int*           output_ids,
                              BeamHypotheses beam_hyps,
                              const int      vocab_size,
                              const int*     end_ids,
                              const int      local_batch_size,
                              const int      beam_width)
{
    for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < local_batch_size * beam_width;
         index += blockDim.x * gridDim.x) {
        int batch_id = index / beam_width;

        sequence_length[index] = finished[index] ? sequence_length[index] : sequence_length[index] + 1;

        int beam_id = (word_ids[index] / vocab_size) % beam_width;
        int word_id = word_ids[index] % vocab_size;

        sequence_length[index] = sequence_length[batch_id * beam_width + beam_id];
        finished[index]        = word_id == end_ids[index / beam_width] ? 1 : 0;
        parent_ids[index]      = beam_id;
        word_ids[index]        = word_id;
        output_ids[index]      = word_id;

        if (beam_hyps.num_beams != nullptr) {
            if (beam_hyps.num_beams[beam_hyps.ite * beam_hyps.local_batch_size + batch_id] == beam_width) {
                for (int i = 0; i < beam_width; i++) {
                    finished[batch_id * beam_width + i] = true;
                }
            }
        }
    }
}

void invokeUpdate(bool*           finished,
                  int*            parent_ids,
                  int*            sequence_length,
                  int*            word_ids,
                  int*            output_ids,
                  BeamHypotheses* beam_hyps,
                  const int       local_batch_size,
                  const int       beam_width,
                  const int       vocab_size_padded,
                  const int*      end_ids,
                  cudaStream_t    stream)
{
    dim3 grid((int)ceil(local_batch_size * beam_width * 1.0 / 256));
    dim3 block(256);

    update_kernel<float><<<grid, block, 0, stream>>>(finished,
                                                     parent_ids,
                                                     sequence_length,
                                                     word_ids,
                                                     output_ids,
                                                     *beam_hyps,
                                                     vocab_size_padded,
                                                     end_ids,
                                                     local_batch_size,
                                                     beam_width);
}
template<typename T>
void OnlineBeamSearchLayer<T>::invokeSoftMax(TensorMap* output_tensors, TensorMap* input_tensors)
{
    // input_tensors:
    //      logits [local_batch_size, beam_width, vocab_size_padded]
    //      embedding_bias [vocab_size_padded]
    //      step [1] on cpu
    //      src_cache_indirection [local_batch_size, beam_width, max_seq_len]
    //      max_input_length [1] on cpu
    //      input_lengths [local_batch_size * beam_width]
    //      ite [1] on cpu
    //      beam_search_diversity_rate [1] on cpu, optional
    //      temperature [1] on cpu, optional
    //      len_penalty [1] on cpu, optional
    //      repetition_penalty [1] on cpu, optional

    // output_tensors:
    //      output_ids [max_seq_len, batch_size, beam_width]
    //      finished [local_batch_size * beam_width]
    //      cum_log_probs [local_batch_size * beam_width]
    //      parent_ids [max_seq_len, batch_size * beam_width]
    //      sequence_length [local_batch_size * beam_width]
    //      tgt_cache_indirection [local_batch_size, beam_width, max_seq_len]
    //      output_log_probs [max_seq_len, batch_size, beam_width]

    FT_CHECK(input_tensors->size() >= 7);
    FT_CHECK(output_tensors->size() >= 6);

    const int batch_size       = output_tensors->at("output_ids").shape[1];
    const int beam_width       = output_tensors->at("output_ids").shape[2];
    const int step             = input_tensors->at("step").getVal<int>();
    const int ite              = input_tensors->at("ite").getVal<int>();
    const int local_batch_size = input_tensors->at("logits").shape[0];

    const float diversity_rate = input_tensors->isExist("beam_search_diversity_rate") ?
                                     input_tensors->at("beam_search_diversity_rate").getVal<float>() :
                                     0.0f;
    const float length_penalty =
        input_tensors->isExist("len_penalty") ? input_tensors->at("len_penalty").getVal<float>() : 0.0f;

    const int id_offset = step * batch_size * beam_width + local_batch_size * ite * beam_width;

    BeamHypotheses beam_hyps;
    if (output_tensors->isExist("beam_hyps")) {
        beam_hyps                      = *((BeamHypotheses*)(output_tensors->at("beam_hyps").getPtr<void>()));
        beam_hyps.step                 = step;
        beam_hyps.ite                  = ite;
        beam_hyps.local_batch_size     = local_batch_size;
        beam_hyps.batch_size           = output_tensors->at("output_ids").shape[1];
        beam_hyps.max_seq_len          = output_tensors->at("output_ids").shape[0];
        beam_hyps.output_ids_src       = output_tensors->at("output_ids").getPtr<int>();
        beam_hyps.parent_ids_src       = output_tensors->at("parent_ids").getPtr<int>();
        beam_hyps.sequence_lengths_src = output_tensors->at("sequence_length").getPtr<int>();
        beam_hyps.log_probs_src        = output_tensors->getPtr<float>("output_log_probs", nullptr);
        beam_hyps.length_penalty       = length_penalty;
        beam_hyps.end_ids              = input_tensors->at("end_id").getPtr<int>();
    }

    invokeTopkSoftMax(input_tensors->at("logits").getPtr<T>(),
                      (const T*)(nullptr),
                      output_tensors->at("finished").getPtr<bool>(),
                      output_tensors->at("sequence_length").getPtr<int>(),
                      output_tensors->at("cum_log_probs").getPtr<float>(),
                      output_tensors->getPtrWithOffset<float>("output_log_probs", id_offset, nullptr),
                      output_tensors->at("output_ids").getPtrWithOffset<int>(id_offset),
                      topk_softmax_workspace_,
                      topk_softmax_workspace_size_,
                      &beam_hyps,
                      local_batch_size,
                      beam_width,
                      vocab_size_padded_,
                      input_tensors->at("end_id").getPtr<int>(),
                      diversity_rate,
                      length_penalty,
                      stream_);
    sync_check_cuda_error();

    invokeUpdate(output_tensors->at("finished").getPtr<bool>(),
                 output_tensors->at("parent_ids").getPtrWithOffset<int>(id_offset),
                 output_tensors->at("sequence_length").getPtr<int>(),
                 output_tensors->at("output_ids").getPtrWithOffset<int>(id_offset),
                 output_tensors->at("output_ids").getPtrWithOffset<int>(id_offset),
                 &beam_hyps,
                 local_batch_size,
                 beam_width,
                 vocab_size_padded_,
                 input_tensors->at("end_id").getPtr<const int>(),
                 stream_);
    sync_check_cuda_error();
}
template<typename T>
void OnlineBeamSearchLayer<T>::allocateBuffer()
{
    FT_CHECK(false);
}

template<typename T>
void OnlineBeamSearchLayer<T>::allocateBuffer(size_t batch_size, size_t beam_width)
{
    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    // we need to check 2 * beam_width candidates each time
    // 64 is the max beam width we support now.
    topk_softmax_workspace_size_ =
        (size_t)(ceil(batch_size * 64 * (64 * 2) / 4.) * 4 * 2
                 + ceil(batch_size * (64 * 2) * SMALL_TOP_K_SOFTMAX_MAX_VOC_PARTS * (2 * (MAX_K * 2) + 2) / 4.) * 4);

    topk_softmax_workspace_ = reinterpret_cast<float*>(
        allocator_->reMalloc(topk_softmax_workspace_, sizeof(float) * topk_softmax_workspace_size_, true));
    is_allocate_buffer_ = true;
}
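To make the sizing formula concrete, a worked evaluation for batch_size = 1, using MAX_K = 4 and SMALL_TOP_K_SOFTMAX_MAX_VOC_PARTS = 128 as defined above:

// term 1: ceil(1 * 64 * (64 * 2) / 4.) * 4 * 2 = 2048 * 8   =  16384
// term 2: ceil(1 * (64 * 2) * 128 * (2 * (4 * 2) + 2) / 4.) * 4
//       = ceil(128 * 128 * 18 / 4) * 4        = 73728 * 4   = 294912
// total: 16384 + 294912 = 311296 float elements, about 1.19 MiB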
template<typename T>
OnlineBeamSearchLayer<T>::OnlineBeamSearchLayer(size_t           max_batch_size,
                                                size_t           head_num,
                                                size_t           size_per_head,
                                                size_t           beam_width,
                                                size_t           vocab_size,
                                                size_t           vocab_size_padded,
                                                int              end_id,
                                                float            diversity_rate,
                                                float            temperature,
                                                float            len_penalty,
                                                float            repetition_penalty,
                                                cudaStream_t     stream,
                                                cublasMMWrapper* cublas_wrapper,
                                                IAllocator*      allocator,
                                                bool             is_free_buffer_after_forward):
    BaseBeamSearchLayer<T>(max_batch_size,
                           head_num,
                           size_per_head,
                           beam_width,
                           vocab_size,
                           vocab_size_padded,
                           end_id,
                           diversity_rate,
                           temperature,
                           len_penalty,
                           repetition_penalty,
                           stream,
                           cublas_wrapper,
                           allocator,
                           is_free_buffer_after_forward)
{
}

template<typename T>
OnlineBeamSearchLayer<T>::OnlineBeamSearchLayer(OnlineBeamSearchLayer<T> const& beam_search_layer):
    BaseBeamSearchLayer<T>(beam_search_layer)
{
    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
}

template<typename T>
OnlineBeamSearchLayer<T>::~OnlineBeamSearchLayer()
{
    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
}

template class OnlineBeamSearchLayer<float>;
template class OnlineBeamSearchLayer<half>;

}  // namespace turbomind
src/turbomind/layers/beam_search_layers/OnlineBeamSearchLayer.h
deleted
100644 → 0
View file @
83697422
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/turbomind/kernels/online_softmax_beamsearch_kernels.h"
#include "src/turbomind/layers/beam_search_layers/BaseBeamSearchLayer.h"
namespace turbomind {

template<typename T>
class OnlineBeamSearchLayer: public BaseBeamSearchLayer<T> {
private:
    // meta data
    using BaseBeamSearchLayer<T>::vocab_size_;
    using BaseBeamSearchLayer<T>::vocab_size_padded_;

    using BaseBeamSearchLayer<T>::topk_softmax_workspace_size_;
    using BaseBeamSearchLayer<T>::topk_softmax_workspace_;

    void allocateBuffer() override;
    void allocateBuffer(size_t batch_size, size_t beam_width) override;

    void invokeSoftMax(TensorMap* output_tensors, TensorMap* input_tensors) override;

    using BaseBeamSearchLayer<T>::stream_;
    using BaseBeamSearchLayer<T>::is_allocate_buffer_;
    using BaseBeamSearchLayer<T>::allocator_;

protected:
public:
    OnlineBeamSearchLayer(size_t           max_batch_size,
                          size_t           head_num,
                          size_t           size_per_head,
                          size_t           beam_width,
                          size_t           vocab_size,
                          size_t           vocab_size_padded,
                          int              end_id,
                          float            diversity_rate,
                          float            temperature,
                          float            len_penalty,
                          float            repetition_penalty,
                          cudaStream_t     stream,
                          cublasMMWrapper* cublas_wrapper,
                          IAllocator*      allocator,
                          bool             is_free_buffer_after_forward);

    OnlineBeamSearchLayer(OnlineBeamSearchLayer<T> const& beam_search_layer);

    ~OnlineBeamSearchLayer();
};

}  // namespace turbomind
src/turbomind/models/llama/CMakeLists.txt
View file @
981a4610
@@ -23,7 +23,6 @@ set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 target_link_libraries(Llama PUBLIC -lcudart
        cublasMMWrapper
        DynamicDecodeLayer
-       BaseBeamSearchLayer
        activation_kernels
        decoder_masked_multihead_attention
        bert_preprocess_kernels
...
src/turbomind/models/llama/prefix_cache.cu
deleted
100644 → 0
View file @
83697422
// Copyright (c) OpenMMLab. All rights reserved.
#include "src/turbomind/models/llama/prefix_cache.h"
// <L,H,D/X,s,X> -> <L,H,D/X,S[:s],X>
template<typename T>
__global__ void insertKeyCache(T* key_cache, const T* src, int L, int H, int Dx, int s, int X, size_t S)
{
    for (int i = threadIdx.x; i < L * H * Dx * s * X; i += blockDim.x) {
        int    i0 = i / X;
        int    x  = i % X;
        int    i1 = i0 / s;
        int    t  = i0 % s;
        size_t j  = (i1 * S + t) * X + x;

        key_cache[j] = src[i];
    }
}

template<typename T>
void invokeInsertKeyCache(T* key_cache, const T* src, int L, int H, int Dx, int s, int X, int S, cudaStream_t st)
{
    insertKeyCache<<<1, 512, 0, st>>>(key_cache, src, L, H, Dx, s, X, S);
}

template void invokeInsertKeyCache(float* key_cache, const float* src, int L, int H, int Dx, int s, int X, int S, cudaStream_t st);
template void invokeInsertKeyCache(half* key_cache, const half* src, int L, int H, int Dx, int s, int X, int S, cudaStream_t st);

// <L,H,s,D> -> <L,H,S[:s],D>
template<typename T>
__global__ void insertValueCache(T* value_cache, const T* src, int L, int H, int s, int D, size_t S)
{
    for (int i = threadIdx.x; i < L * H * s * D; i += blockDim.x) {
        int    i0 = i / D;
        int    d  = i % D;
        int    i1 = i0 / s;
        int    t  = i0 % s;
        size_t j  = (i1 * S + t) * D + d;

        value_cache[j] = src[i];
    }
}

template<typename T>
void invokeInsertValueCache(T* value_cache, const T* src, int L, int H, int s, int D, int S, cudaStream_t st)
{
    insertValueCache<<<1, 512, 0, st>>>(value_cache, src, L, H, s, D, S);
}

template void invokeInsertValueCache(float* value_cache, const float* src, int L, int H, int s, int D, int S, cudaStream_t st);
template void invokeInsertValueCache(half* value_cache, const half* src, int L, int H, int s, int D, int S, cudaStream_t st);
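For orientation, a host-side sketch of driving the key-cache launcher above. Every dimension value below is illustrative, not taken from the deleted code; the header path is the one the file itself includes:

#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include "src/turbomind/models/llama/prefix_cache.h"

int main()
{
    // Illustrative dims for the <L,H,D/X,s,X> source layout: layers, heads,
    // head dim, vector width X (8 halves = one 16-byte access), prefix
    // length s, and padded sequence length S.
    int L = 32, H = 32, D = 128, X = 8;
    int s = 17, S = 2048;

    half *key_cache, *src;
    cudaMalloc((void**)&key_cache, sizeof(half) * L * H * (D / X) * S * X);
    cudaMalloc((void**)&src, sizeof(half) * L * H * (D / X) * s * X);

    // Scatter the s-token prefix into the first s time slots of each
    // S-long row of the cache.
    invokeInsertKeyCache(key_cache, src, L, H, D / X, s, X, S, /*stream=*/0);
    cudaDeviceSynchronize();

    cudaFree(src);
    cudaFree(key_cache);
    return 0;
}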