OpenDAS / Lmdeploy / Commits

Unverified commit 981a4610, authored Jul 31, 2023 by Li Zhang, committed by GitHub on Jul 31, 2023

[Fix] Remove unused code to reduce binary size (#181)

* clean-up
* fix lint
* fix lint

Parent: 83697422

Showing 20 changed files with 89 additions and 3862 deletions (+89, -3862)
CMakeLists.txt  +0 -6
src/turbomind/kernels/CMakeLists.txt  +0 -18
src/turbomind/kernels/activation_kernels.cu  +11 -342
src/turbomind/kernels/beam_search_penalty_kernels.cu  +0 -313
src/turbomind/kernels/beam_search_penalty_kernels.h  +0 -48
src/turbomind/kernels/beam_search_topk_kernels.cu  +0 -845
src/turbomind/kernels/beam_search_topk_kernels.h  +0 -94
src/turbomind/kernels/bert_preprocess_kernels.cu  +0 -293
src/turbomind/kernels/bert_preprocess_kernels.h  +0 -58
src/turbomind/kernels/decoder_masked_multihead_attention.h  +0 -1
src/turbomind/kernels/decoding_kernels.cu  +49 -460
src/turbomind/kernels/decoding_kernels.h  +0 -76
src/turbomind/kernels/gen_relative_pos_bias.cu  +0 -304
src/turbomind/kernels/gen_relative_pos_bias.h  +0 -56
src/turbomind/kernels/logprob_kernels.cu  +25 -25
src/turbomind/kernels/online_softmax_beamsearch_kernels.cu  +0 -739
src/turbomind/kernels/online_softmax_beamsearch_kernels.h  +0 -41
src/turbomind/layers/CMakeLists.txt  +2 -5
src/turbomind/layers/DynamicDecodeLayer.cc  +2 -135
src/turbomind/layers/DynamicDecodeLayer.h  +0 -3
CMakeLists.txt
...
@@ -299,21 +299,16 @@ endif()
########################################
add_library(transformer-shared SHARED
  $<TARGET_OBJECTS:BaseBeamSearchLayer>
  $<TARGET_OBJECTS:BaseSamplingLayer>
  $<TARGET_OBJECTS:BeamSearchLayer>
  $<TARGET_OBJECTS:DynamicDecodeLayer>
  $<TARGET_OBJECTS:llama_fmha>
  $<TARGET_OBJECTS:Llama>
  $<TARGET_OBJECTS:LlamaTritonBackend>
  $<TARGET_OBJECTS:OnlineBeamSearchLayer>
  $<TARGET_OBJECTS:TopKSamplingLayer>
  $<TARGET_OBJECTS:TopPSamplingLayer>
  $<TARGET_OBJECTS:TransformerTritonBackend>
  $<TARGET_OBJECTS:activation_kernels>
  $<TARGET_OBJECTS:ban_bad_words>
  $<TARGET_OBJECTS:beam_search_penalty_kernels>
  $<TARGET_OBJECTS:beam_search_topk_kernels>
  $<TARGET_OBJECTS:bert_preprocess_kernels>
  $<TARGET_OBJECTS:cublasAlgoMap>
  $<TARGET_OBJECTS:cublasMMWrapper>
...
@@ -329,7 +324,6 @@ add_library(transformer-shared SHARED
$<TARGET_OBJECTS:mpi_utils>
$<TARGET_OBJECTS:nccl_utils>
$<TARGET_OBJECTS:nvtx_utils>
$<TARGET_OBJECTS:online_softmax_beamsearch_kernels>
$<TARGET_OBJECTS:sampling_penalty_kernels>
$<TARGET_OBJECTS:sampling_topk_kernels>
$<TARGET_OBJECTS:sampling_topp_kernels>
...

src/turbomind/kernels/CMakeLists.txt
...
@@ -26,11 +26,6 @@ add_library(activation_kernels STATIC activation_kernels.cu)
set_property(TARGET activation_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET activation_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)

add_library(gen_relative_pos_bias STATIC gen_relative_pos_bias.cu)
set_property(TARGET gen_relative_pos_bias PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET gen_relative_pos_bias PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(gen_relative_pos_bias PUBLIC activation_kernels)

add_library(logprob_kernels STATIC logprob_kernels.cu)
set_property(TARGET logprob_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET logprob_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
...
@@ -51,10 +46,6 @@ add_library(decoder_masked_multihead_attention STATIC ${decoder_masked_multihead
set_property(TARGET decoder_masked_multihead_attention PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET decoder_masked_multihead_attention PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)

add_library(online_softmax_beamsearch_kernels STATIC online_softmax_beamsearch_kernels.cu)
set_property(TARGET online_softmax_beamsearch_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET online_softmax_beamsearch_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)

add_library(decoding_kernels STATIC decoding_kernels.cu)
set_property(TARGET decoding_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET decoding_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
...
@@ -63,15 +54,6 @@ add_library(gpt_kernels STATIC gpt_kernels.cu)
set_property(TARGET gpt_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET gpt_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)

add_library(beam_search_penalty_kernels STATIC beam_search_penalty_kernels.cu)
set_property(TARGET beam_search_penalty_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET beam_search_penalty_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(beam_search_penalty_kernels PRIVATE cuda_utils)

add_library(beam_search_topk_kernels STATIC beam_search_topk_kernels.cu)
set_property(TARGET beam_search_topk_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET beam_search_topk_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)

add_library(sampling_topk_kernels STATIC sampling_topk_kernels.cu)
set_property(TARGET sampling_topk_kernels PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET sampling_topk_kernels PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
...

src/turbomind/kernels/activation_kernels.cu
...
@@ -306,17 +306,17 @@ void invokeGenericActivation(T* out,
const int seq_len, \
cudaStream_t stream);
INSTANTIATE_GENERIC_ACTIVATION(GeluActivation, float, float);
INSTANTIATE_GENERIC_ACTIVATION(GeluActivation, half, half);
#ifdef ENABLE_BF16
INSTANTIATE_GENERIC_ACTIVATION(GeluActivation, __nv_bfloat16, __nv_bfloat16);
#endif

INSTANTIATE_GENERIC_ACTIVATION(ReluActivation, float, float);
INSTANTIATE_GENERIC_ACTIVATION(ReluActivation, half, half);
#ifdef ENABLE_BF16
INSTANTIATE_GENERIC_ACTIVATION(ReluActivation, __nv_bfloat16, __nv_bfloat16);
#endif

// INSTANTIATE_GENERIC_ACTIVATION(GeluActivation, float, float);
// INSTANTIATE_GENERIC_ACTIVATION(GeluActivation, half, half);
// #ifdef ENABLE_BF16
// INSTANTIATE_GENERIC_ACTIVATION(GeluActivation, __nv_bfloat16, __nv_bfloat16);
// #endif

// INSTANTIATE_GENERIC_ACTIVATION(ReluActivation, float, float);
// INSTANTIATE_GENERIC_ACTIVATION(ReluActivation, half, half);
// #ifdef ENABLE_BF16
// INSTANTIATE_GENERIC_ACTIVATION(ReluActivation, __nv_bfloat16, __nv_bfloat16);
// #endif

INSTANTIATE_GENERIC_ACTIVATION(SiluActivation, float, float);
INSTANTIATE_GENERIC_ACTIVATION(SiluActivation, half, half);
...
@@ -324,335 +324,4 @@ INSTANTIATE_GENERIC_ACTIVATION(SiluActivation, half, half);
INSTANTIATE_GENERIC_ACTIVATION(SiluActivation, __nv_bfloat16, __nv_bfloat16);
#endif

INSTANTIATE_GENERIC_ACTIVATION(IdentityActivation, float, float);
INSTANTIATE_GENERIC_ACTIVATION(IdentityActivation, half, half);
INSTANTIATE_GENERIC_ACTIVATION(IdentityActivation, float, half);
#ifdef ENABLE_BF16
INSTANTIATE_GENERIC_ACTIVATION(IdentityActivation, __nv_bfloat16, __nv_bfloat16);
INSTANTIATE_GENERIC_ACTIVATION(IdentityActivation, float, __nv_bfloat16);
#endif
#undef INSTANCIATE_GENERIC_ACTIVATION

template<typename T>
__global__ void add_bias_tanh(T* out, const T* __restrict bias, int m, int n)
{
    for (int id = blockIdx.x * blockDim.x + threadIdx.x; id < m * n; id += blockDim.x * gridDim.x) {
        T val = out[id];
        if (bias != nullptr) {
            val = val + ldg(&bias[id % n]);
        }
        out[id] = tanhf(val);
    }
}

template<>
__global__ void add_bias_tanh(half* out, const half* __restrict bias, int m, int n)
{
    half2*       out_ptr  = (half2*)out;
    const half2* bias_ptr = (half2*)bias;

    for (int id = blockIdx.x * blockDim.x + threadIdx.x; id < m * n; id += blockDim.x * gridDim.x) {
        half2 val = out_ptr[id];
        if (bias != nullptr) {
            val = val + __ldg(&bias_ptr[id % n]);
        }
        val.x       = tanhf(val.x);
        val.y       = tanhf(val.y);
        out_ptr[id] = val;
    }
}

#ifdef ENABLE_BF16
template<>
__global__ void add_bias_tanh(__nv_bfloat16* out, const __nv_bfloat16* __restrict bias, int m, int n)
{
    __nv_bfloat162*       out_ptr  = (__nv_bfloat162*)out;
    const __nv_bfloat162* bias_ptr = (__nv_bfloat162*)bias;

    for (int id = blockIdx.x * blockDim.x + threadIdx.x; id < m * n; id += blockDim.x * gridDim.x) {
        __nv_bfloat162 val = out_ptr[id];
        if (bias != nullptr) {
            val = bf16hadd2(val, ldg(&bias_ptr[id % n]));
        }
        val.x       = tanhf(val.x);
        val.y       = tanhf(val.y);
        out_ptr[id] = val;
    }
}
#endif

template<typename T>
void invokeAddBiasTanh(T* out, const T* bias, const int m, const int n, cudaStream_t stream)
{
    const int data_type_factor = 4 / sizeof(T);  // 1 for fp32, 2 for fp16 and bf16
    dim3      block, grid;
    if (n / 4 / data_type_factor <= 1024) {
        block.x = n / 4 / data_type_factor;
        grid.x  = m;
    }
    else {
        block.x = 1024;
        grid.x  = ceil(m * n / 1024.);
    }
    add_bias_tanh<T><<<grid, block, 0, stream>>>(out, bias, m, n / data_type_factor);
}

template void invokeAddBiasTanh(float* out, const float* bias, const int m, const int n, cudaStream_t stream);
template void invokeAddBiasTanh(half* out, const half* bias, const int m, const int n, cudaStream_t stream);
#ifdef ENABLE_BF16
template void invokeAddBiasTanh(__nv_bfloat16* out, const __nv_bfloat16* bias, const int m, const int n, cudaStream_t stream);
#endif

template<typename T2, int N>
__global__ void addBiasGeluV2(T2* out,
                              const T2* __restrict bias,
                              const int* ia3_tasks,
                              const T2* ia3_weights,
                              const int size,
                              const int* padding_offset,
                              const int seq_len)
{
    const bool with_ia3 = ia3_tasks != nullptr;
    for (int id = blockIdx.x * blockDim.x + threadIdx.x; id < size; id += blockDim.x * gridDim.x) {
        T2 val = out[id];
        if (bias != nullptr) {
            T2 reg_bias = ldg(&bias[id % N]);
            val         = hadd2(val, reg_bias);
        }
        val = GeluActivation<T2>::apply(val);
        if (with_ia3) {
            const int word_id  = id / N;
            const int offset   = padding_offset == nullptr ? 0 : padding_offset[word_id];
            const int batch_id = (word_id + offset) / seq_len;
            const int task     = ia3_tasks[batch_id];
            val                = val * ia3_weights[task * N + (id % N)];
        }
        out[id] = val;
    }
}

template<typename T2, int N, int ELEMENT_PER_ROUND>
__global__ void addBiasGeluV3(T2* out,
                              const T2* __restrict bias,
                              const int* ia3_tasks,
                              const T2* ia3_weights,
                              const int size,
                              const int* padding_offset,
                              const int seq_len)
{
    const bool with_ia3 = ia3_tasks != nullptr;
    T2         buffer[ELEMENT_PER_ROUND];
    T2         tmp_bias[ELEMENT_PER_ROUND];
    for (int id = blockIdx.x * blockDim.x * ELEMENT_PER_ROUND + threadIdx.x * ELEMENT_PER_ROUND; id < size;
         id += blockDim.x * gridDim.x * ELEMENT_PER_ROUND) {
#pragma unroll
        for (int i = 0; i < ELEMENT_PER_ROUND; i++) {
            buffer[i] = out[id + i];
            if (bias != nullptr) {
                tmp_bias[i] = ldg(&bias[(id + i) % N]);
            }
        }
#pragma unroll
        for (int i = 0; i < ELEMENT_PER_ROUND; i++) {
            if (bias != nullptr) {
                buffer[i] = hadd2(buffer[i], tmp_bias[i]);
            }
            buffer[i] = GeluActivation<T2>::apply(buffer[i]);
            if (with_ia3) {
                const int word_id  = (id + i) / N;
                const int offset   = padding_offset == nullptr ? 0 : padding_offset[word_id];
                const int batch_id = (word_id + offset) / seq_len;
                const int task     = ia3_tasks[batch_id];
                buffer[i]          = buffer[i] * ia3_weights[task * N + ((id + i) % N)];
            }
            out[id + i] = buffer[i];
        }
    }
}
#define ADD_BIAS_GELU(HALF_N, ELEMENT_PER_ROUND) \
case HALF_N: \
if (ELEMENT_PER_ROUND > 1) { \
grid.x = grid.x / ELEMENT_PER_ROUND; \
addBiasGeluV3<T2, HALF_N, ELEMENT_PER_ROUND><<<grid, block, 0, stream>>>( \
(T2*)out, (const T2*)bias, ia3_tasks, (T2*)ia3_weights, m * half_n, padding_offset, seq_len); \
} \
else { \
addBiasGeluV2<T2, HALF_N><<<grid, block, 0, stream>>>( \
(T2*)out, (const T2*)bias, ia3_tasks, (T2*)ia3_weights, m * half_n, padding_offset, seq_len); \
} \
break;
template<typename T>
void invokeAddBiasGeluV2(T* out,
                         const T* bias,
                         const int* ia3_tasks,
                         const T* ia3_weights,
                         const int* padding_offset,
                         const int seq_len,
                         const int m,
                         const int n,
                         cudaStream_t stream)
{
    if (n % 2 == 0 && sizeof(T) == 2) {
        const int half_n = n / 2;
        dim3      block, grid;
        block.x  = std::min(half_n, 512);
        grid.x   = (m * half_n + (block.x - 1)) / block.x;
        using T2 = typename TypeConverter<T>::Type;
        if (grid.x >= 512) {
            switch (half_n) {
                ADD_BIAS_GELU(256, 1)
                ADD_BIAS_GELU(512, 1)
                ADD_BIAS_GELU(1024, 1)
                ADD_BIAS_GELU(1536, 1)
                ADD_BIAS_GELU(2048, 1)
                ADD_BIAS_GELU(4096, 2)
                ADD_BIAS_GELU(8192, 2)
                ADD_BIAS_GELU(16384, 2)
                ADD_BIAS_GELU(24576, 2)
                ADD_BIAS_GELU(40960, 4)
                default:
                    invokeGenericActivation<GeluActivation>(out, bias, (T*)nullptr, (T*)nullptr, ia3_tasks,
                                                            ia3_weights, m, n, 0, (float*)nullptr, (float*)nullptr,
                                                            padding_offset, seq_len, stream);
                    break;
            }
        }
        else {
            switch (half_n) {
                ADD_BIAS_GELU(256, 1)
                ADD_BIAS_GELU(512, 1)
                ADD_BIAS_GELU(1024, 1)
                ADD_BIAS_GELU(1536, 1)
                ADD_BIAS_GELU(2048, 1)
                ADD_BIAS_GELU(4096, 1)
                ADD_BIAS_GELU(8192, 2)
                ADD_BIAS_GELU(16384, 2)
                ADD_BIAS_GELU(24576, 2)
                ADD_BIAS_GELU(40960, 2)
                default:
                    invokeGenericActivation<GeluActivation>(out, bias, (T*)nullptr, (T*)nullptr, ia3_tasks,
                                                            ia3_weights, m, n, 0, (float*)nullptr, (float*)nullptr,
                                                            padding_offset, seq_len, stream);
                    break;
            }
        }
    }
    else {
        invokeGenericActivation<GeluActivation>(out, bias, (T*)nullptr, (T*)nullptr, ia3_tasks, ia3_weights, m, n, 0,
                                                (float*)nullptr, (float*)nullptr, padding_offset, seq_len, stream);
    }
}
#undef ADD_BIAS_GELU

template void invokeAddBiasGeluV2(float* out, const float* bias, const int* ia3_tasks, const float* ia3_weights,
                                  const int* padding_offset, const int seq_len, const int m, const int n,
                                  cudaStream_t stream);
template void invokeAddBiasGeluV2(half* out, const half* bias, const int* ia3_tasks, const half* ia3_weights,
                                  const int* padding_offset, const int seq_len, const int m, const int n,
                                  cudaStream_t stream);
#ifdef ENABLE_BF16
template void invokeAddBiasGeluV2(__nv_bfloat16* out, const __nv_bfloat16* bias, const int* ia3_tasks,
                                  const __nv_bfloat16* ia3_weights, const int* padding_offset, const int seq_len,
                                  const int m, const int n, cudaStream_t stream);
#endif  // ENABLE_BF16

template<typename T>
__global__ void sigmoid_kernel(T* data, const int size, const float scale)
{
    const int index = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
    if (index < size) {
        float val   = cuda_cast<float>(data[index]);
        val         = 1.0f / (1.0f + exp(-val)) * scale;
        data[index] = T(val);
    }
}

template<>
__global__ void sigmoid_kernel(half2* data, const int size, const float scale)
{
    const int index = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
    if (index < size / 2) {
        half2  val        = data[index];
        float2 val_float2 = cuda_cast<float2>(val);
        val_float2.x      = 1.0f / (1.0f + exp(-val_float2.x)) * scale;
        val_float2.y      = 1.0f / (1.0f + exp(-val_float2.y)) * scale;
        data[index]       = cuda_cast<half2>(val_float2);
    }
}

template<typename T>
void invokeSigmoid(T* data, const int size, const float scale, cudaStream_t stream)
{
    if (std::is_same<T, float>::value || (size % 2 != 0)) {
        dim3 block(128);
        dim3 grid((size + 127) / 128);
        sigmoid_kernel<<<grid, block, 0, stream>>>(data, size, scale);
    }
    else {
        dim3 block(128);
        dim3 grid((size + 255) / 256);
        sigmoid_kernel<<<grid, block, 0, stream>>>((half2*)data, size, scale);
    }
}

template void invokeSigmoid(float* data, const int size, const float scale, cudaStream_t stream);
template void invokeSigmoid(half* data, const int size, const float scale, cudaStream_t stream);

}  // namespace turbomind
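Note on the change above: the file now keeps only the SiluActivation and IdentityActivation instantiations, since the explicit INSTANTIATE_GENERIC_ACTIVATION calls for GeluActivation and ReluActivation are commented out and the trailing helpers (add_bias_tanh, addBiasGeluV2/V3, sigmoid_kernel and their launchers) are deleted outright. Each explicit instantiation forces the compiler to emit a full copy of the templated kernel, so dropping unused ones shrinks the binary. Below is a minimal, self-contained C++ sketch of that mechanism; the names and the placeholder body are hypothetical and not code from this repository.

// instantiation_sketch.cc - hypothetical illustration, not part of the commit
#include <cstdio>

template<typename T>
T gelu_like(T x)  // stand-in for a templated device kernel body
{
    return x > T(0) ? x : T(0);  // placeholder math; the real kernels are far larger
}

// Each macro call emits one explicit instantiation, i.e. one compiled copy of the template.
#define INSTANTIATE_GELU_LIKE(T) template T gelu_like<T>(T x)

INSTANTIATE_GELU_LIKE(float);
// INSTANTIATE_GELU_LIKE(double);  // commented out: this copy is no longer generated

int main()
{
    std::printf("%f\n", gelu_like(1.0f));
    return 0;
}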
src/turbomind/kernels/beam_search_penalty_kernels.cu  (deleted, 100644 → 0)
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <assert.h>
#include "src/turbomind/kernels/beam_search_penalty_kernels.h"
#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
namespace turbomind {

template<typename T>
__global__ void add_bias_temperature(T* logits,
                                     const T* bias,
                                     const int batch_size,
                                     const int beam_width,
                                     const int vocab_size,
                                     const int vocab_size_padded,
                                     const float temperature)
{
    int tid  = threadIdx.x;
    int bid  = blockIdx.x;
    int bbid = blockIdx.y;

    logits += bbid * vocab_size_padded;

    const T MASK_VAL = (std::is_same<T, half>::value) ? -HALF_FLT_MAX : -FLT_MAX;
    const T inv_temp = static_cast<T>(1.0f / (temperature + 1e-6f));
    for (int i = tid + bid * blockDim.x; i < vocab_size_padded; i += blockDim.x * gridDim.x) {
        if (i < vocab_size) {
            T bias_val = bias == nullptr ? (T)(0.0f) : bias[i];
            logits[i]  = (logits[i] + bias_val) * inv_temp;
        }
        else {
            logits[i] = MASK_VAL;
        }
    }
}

template<>
__global__ void add_bias_temperature(half2* logits,
                                     const half2* bias,
                                     const int batch_size,
                                     const int beam_width,
                                     const int vocab_size,
                                     const int vocab_size_padded,
                                     const float temperature)
{
    assert(vocab_size % 2 == 0);
    assert(vocab_size_padded % 2 == 0);

    const int tid  = threadIdx.x;
    const int bid  = blockIdx.x;
    const int bbid = blockIdx.y;

    const half2 mask_val = __float2half2_rn(-HALF_FLT_MAX);
    const half2 inv_temp = __float2half2_rn(1.0f / (temperature + 1e-6f));

    const int half_vocab_size        = vocab_size / 2;
    const int half_vocab_size_padded = vocab_size_padded / 2;

    logits += bbid * half_vocab_size_padded;
    for (int index = tid + bid * blockDim.x; index < half_vocab_size_padded; index += blockDim.x * gridDim.x) {
        int   vocab_idx = index % half_vocab_size_padded;
        half2 logit     = vocab_idx < half_vocab_size ? __ldg(&logits[index]) : mask_val;
        if (vocab_idx < half_vocab_size) {
            if (bias != nullptr) {
                logit = __hadd2(logit, bias[vocab_idx]);
            }
            logit = __hmul2(logit, inv_temp);
        }
        logits[index] = logit;
    }
}

template<typename T, bool IS_ADDITIVE>
__global__ void apply_repetition_penalty(T* logits,
                                         const int batch_size,
                                         const int beam_width,
                                         const int vocab_size,
                                         const int vocab_size_padded,
                                         const int step,
                                         const int* current_ids,
                                         const int* previous_ids,
                                         const int* parent_ids,
                                         const int* input_lengths,
                                         const int max_input_length,
                                         const float repetition_penalty)
{
    assert(step > 0);

    const int tid      = threadIdx.x;
    const int bbid     = blockIdx.x;
    const int batch_id = bbid / beam_width;
    const int bbsize   = batch_size * beam_width;

    logits += bbid * vocab_size_padded;
    extern __shared__ char sbuf[];
    T*                     penalty_logits = reinterpret_cast<T*>(sbuf);
    // prevent misaligment when sizeof(T) = 2
    int*      penalty_indices = reinterpret_cast<int*>(sbuf + (sizeof(T) * step + 31) / 32 * 32);
    const int input_length    = (input_lengths != nullptr) ? input_lengths[bbid] : max_input_length;
    if (tid == 0) {
        T   repet_penalty         = static_cast<T>(repetition_penalty);
        int prev_id               = current_ids[bbid];
        T   prev_logit            = logits[prev_id];
        penalty_indices[step - 1] = prev_id;
        if (IS_ADDITIVE) {
            penalty_logits[step - 1] = prev_logit - repet_penalty;
        }
        else {
            penalty_logits[step - 1] = prev_logit > T(0) ? prev_logit / repet_penalty : prev_logit * repet_penalty;
        }
        if (step > 1) {
            int parent_beam = bbid % beam_width;
            for (int i = step - 2; i >= 0; --i) {
                // Skip the padded tokens.
                if (i >= input_length && i < max_input_length) {
                    continue;
                }
                parent_beam        = parent_ids[i * bbsize + batch_id * beam_width + parent_beam];
                prev_id            = previous_ids[i * bbsize + batch_id * beam_width + parent_beam];
                prev_logit         = logits[prev_id];
                penalty_indices[i] = prev_id;
                if (IS_ADDITIVE) {
                    penalty_logits[i] = prev_logit - repet_penalty;
                }
                else {
                    penalty_logits[i] = prev_logit > T(0) ? prev_logit / repet_penalty : prev_logit * repet_penalty;
                }
            }
        }
    }
    __syncthreads();
    for (int i = tid; i < step; i += blockDim.x) {
        if (i >= input_length && i < max_input_length) {
            continue;
        }
        logits[penalty_indices[i]] = penalty_logits[i];
    }
}

template<typename T>
__global__ void apply_min_length_penalty(T* logits,
                                         const int min_length,
                                         const int* end_ids,
                                         const int* sequence_lengths,
                                         const int max_input_length,
                                         const int beam_width,
                                         const int vocab_size_padded)
{
    int bbid = threadIdx.x + blockIdx.x * blockDim.x;  // batch-beam index
    int bid  = bbid / beam_width;                      // batch index
    // We need +1 because sequence_lengths = max_input_length + num_gen_tokens - 1,
    // which is equal to the length of k/v caches.
    if (sequence_lengths[bbid] + 1 - max_input_length < min_length) {
        T mask_val = (std::is_same<T, half>::value) ? -HALF_FLT_MAX : -FLT_MAX;
        logits[bbid * vocab_size_padded + end_ids[bid]] = mask_val;
    }
}

template<typename T>
void invokeAddBiasApplyPenalties(int step,
                                 T* logits,
                                 const int* current_ids,
                                 const int* previous_ids,
                                 const int* parent_ids,
                                 const int* input_lengths,
                                 const int* sequence_lengths,
                                 const T* bias,
                                 const int ite,
                                 const int max_input_length,
                                 const int local_batch_size,
                                 const int batch_size,
                                 const int beam_width,
                                 const int vocab_size,
                                 const int vocab_size_padded,
                                 const int* end_ids,
                                 const float temperature,
                                 const float repetition_penalty,
                                 const RepetitionPenaltyType repetition_penalty_type,
                                 const int min_length,
                                 cudaStream_t stream)
{
    if (bias != nullptr || temperature != 1.0f || vocab_size != vocab_size_padded) {
        dim3 block(512);
        if (std::is_same<T, half>::value && vocab_size % 2 == 0 && vocab_size_padded % 2 == 0) {
            dim3 grid((vocab_size_padded / 2 + block.x - 1) / block.x, beam_width * local_batch_size);
            add_bias_temperature<<<grid, block, 0, stream>>>(reinterpret_cast<half2*>(logits),
                                                             reinterpret_cast<const half2*>(bias),
                                                             batch_size,
                                                             beam_width,
                                                             vocab_size,
                                                             vocab_size_padded,
                                                             temperature);
        }
        else {
            dim3 grid((vocab_size_padded + block.x - 1) / block.x, beam_width * local_batch_size);
            add_bias_temperature<<<grid, block, 0, stream>>>(
                logits, bias, batch_size, beam_width, vocab_size, vocab_size_padded, temperature);
        }
    }

    if (repetition_penalty_type != RepetitionPenaltyType::None && step > 0) {
        if (repetition_penalty != getDefaultPenaltyValue(repetition_penalty_type)) {
            size_t smem_size = (sizeof(T) * step + 31) / 32 * 32 + sizeof(int) * step;
            dim3   block(256);
            dim3   grid(beam_width * local_batch_size);
            if (repetition_penalty_type == RepetitionPenaltyType::Multiplicative) {
                apply_repetition_penalty<T, false><<<grid, block, smem_size, stream>>>(
                    logits,
                    batch_size,
                    beam_width,
                    vocab_size,
                    vocab_size_padded,
                    step,
                    current_ids,
                    previous_ids,
                    // TODO(jaedeokk):
                    //   Remove (+ite ...) by getting parent_ids with offset
                    //   and then remove 'ite' argument from the function.
                    parent_ids + ite * beam_width * local_batch_size,
                    input_lengths,
                    max_input_length,
                    repetition_penalty);
            }
            else if (repetition_penalty_type == RepetitionPenaltyType::Additive) {
                apply_repetition_penalty<T, true><<<grid, block, smem_size, stream>>>(
                    logits,
                    batch_size,
                    beam_width,
                    vocab_size,
                    vocab_size_padded,
                    step,
                    current_ids,
                    previous_ids,
                    parent_ids + ite * beam_width * local_batch_size,
                    input_lengths,
                    max_input_length,
                    repetition_penalty);
            }
        }
    }

    if (step - max_input_length < min_length) {
        FT_CHECK_WITH_INFO(sequence_lengths != nullptr, "Need sequence_lengths to apply min length penlaty");
        FT_CHECK_WITH_INFO(end_ids != nullptr, "Need end_id to apply min length penlaty");

        const int block_size = min(local_batch_size * beam_width, 1024);
        const int grid_size  = (local_batch_size * beam_width + block_size - 1) / block_size;
        apply_min_length_penalty<<<grid_size, block_size, 0, stream>>>(
            logits, min_length, end_ids, sequence_lengths, max_input_length, beam_width, vocab_size_padded);
    }
}

template void invokeAddBiasApplyPenalties(int step, float* logits, const int* current_ids, const int* previous_ids,
                                          const int* parent_ids, const int* input_lengths,
                                          const int* sequence_lengths, const float* bias, const int ite,
                                          const int max_input_length, const int local_batch_size,
                                          const int batch_size, const int beam_width, const int vocab_size,
                                          const int vocab_size_padded, const int* end_ids, const float temperature,
                                          const float repetition_penalty,
                                          const RepetitionPenaltyType repetition_penalty_type, const int min_length,
                                          cudaStream_t stream);

template void invokeAddBiasApplyPenalties(int step, half* logits, const int* current_ids, const int* previous_ids,
                                          const int* parent_ids, const int* input_lengths,
                                          const int* sequence_lengths, const half* bias, const int ite,
                                          const int max_input_length, const int local_batch_size,
                                          const int batch_size, const int beam_width, const int vocab_size,
                                          const int vocab_size_padded, const int* end_ids, const float temperature,
                                          const float repetition_penalty,
                                          const RepetitionPenaltyType repetition_penalty_type, const int min_length,
                                          cudaStream_t stream);

}  // namespace turbomind
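For reference, the deleted apply_repetition_penalty kernel implements two penalty modes: multiplicative (divide a positive logit by the penalty, multiply a negative one, so the token is pushed down either way) and additive (subtract the penalty). A small host-side C++ sketch of the same rules follows; the function names and sample values are hypothetical and purely illustrative.

// penalty_sketch.cc - hypothetical illustration of the two penalty modes above
#include <cstdio>

// Multiplicative: divide positive logits, multiply negative ones.
float multiplicative_penalty(float logit, float penalty)
{
    return logit > 0.f ? logit / penalty : logit * penalty;
}

// Additive: simply subtract the penalty.
float additive_penalty(float logit, float penalty)
{
    return logit - penalty;
}

int main()
{
    const float penalty  = 1.2f;
    const float logits[] = {2.0f, -1.0f};
    for (float x : logits) {
        std::printf("logit %+.2f -> multiplicative %+.3f, additive %+.3f\n",
                    x, multiplicative_penalty(x, penalty), additive_penalty(x, penalty));
    }
    return 0;
}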
src/turbomind/kernels/beam_search_penalty_kernels.h  (deleted, 100644 → 0)
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cuda_fp16.h>
#include "src/turbomind/kernels/penalty_types.h"
#include "src/turbomind/utils/cuda_utils.h"
namespace turbomind {

template<typename T>
void invokeAddBiasApplyPenalties(int step,
                                 T* logits,
                                 const int* current_ids,
                                 const int* previous_ids,
                                 const int* parent_ids,
                                 const int* input_lengths,
                                 const int* sequence_lengths,
                                 const T* bias,
                                 const int ite,
                                 const int max_input_length,
                                 const int local_batch_size,
                                 const int batch_size,
                                 const int beam_width,
                                 const int vocab_size,
                                 const int vocab_size_padded,
                                 const int* end_ids,
                                 const float temperature,
                                 const float repetition_penalty,
                                 const RepetitionPenaltyType repetition_penalty_type,
                                 const int min_length,
                                 cudaStream_t stream);

}  // namespace turbomind
src/turbomind/kernels/beam_search_topk_kernels.cu  (deleted, 100644 → 0)
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#elif (CUDART_VERSION >= 11050)
#include <cub/cub.cuh>
#else
#include "3rdparty/cub/cub.cuh"
#endif
#include "src/turbomind/kernels/beam_search_topk_kernels.h"
#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
#include "src/turbomind/utils/cuda_type_utils.cuh"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/logger.h"
namespace turbomind {

template<typename T>
__device__ __forceinline__ T apply_length_penalty(T log_prob, int length, float length_penalty)
{
    // score = log(prob) / (length)^length_penalty.
    if (length_penalty == 0.0f || length == 1) {
        return log_prob;
    }
    return log_prob / static_cast<T>(powf((float)length, length_penalty));
}

template<typename T, int MAX_K, int THREADBLOCK_SIZE>
__launch_bounds__(THREADBLOCK_SIZE) __global__ void beam_topK_kernel(const T* log_probs,
                                                                      int* topk_tmp_id_buf,
                                                                      T* topk_tmp_val_buf,
                                                                      const bool* finished,
                                                                      const int* sequence_lengths,
                                                                      const int vocab_size,
                                                                      T diversity_rate,
                                                                      float length_penalty)
{
    typedef cub::BlockReduce<TopK<T, MAX_K>, THREADBLOCK_SIZE> BlockReduce;
    __shared__ typename BlockReduce::TempStorage               temp_storage;

    int thread_id = threadIdx.x;
    int block_id  = blockIdx.x;  // batch beam index.

    TopK<T, MAX_K> partial;

    const bool IS_FP16   = std::is_same<T, half>::value;
    const T    MAX_T_VAL = (IS_FP16) ? HALF_FLT_MAX : FLT_MAX;

#pragma unroll
    for (int i = 0; i < MAX_K; ++i) {
        partial.p[i] = -1;
        partial.u[i] = -MAX_T_VAL;
    }

#pragma unroll
    for (int elem_id = thread_id; elem_id < vocab_size; elem_id += THREADBLOCK_SIZE) {
        int index = elem_id + block_id * vocab_size;
        T   score = length_penalty == 0.0f ?
                        log_probs[index] :
                        apply_length_penalty(log_probs[index],
                                             finished[block_id] ? sequence_lengths[block_id] :
                                                                  sequence_lengths[block_id] + 1,
                                             length_penalty);
        partial.insert(score, index);
    }

    TopK<T, MAX_K> total = BlockReduce(temp_storage).Reduce(partial, reduce_topk_op<T, MAX_K>);

    if (thread_id == 0) {
        int index = block_id * MAX_K;

#pragma unroll
        for (int i = 0; i < MAX_K; ++i) {
            topk_tmp_id_buf[index + i]  = total.p[i];
            topk_tmp_val_buf[index + i] = total.u[i] + diversity_rate * (T)i;
        }
    }
}

template<typename T, int MAX_K, int THREADBLOCK_SIZE>
__launch_bounds__(THREADBLOCK_SIZE) __global__
    void batch_topK_kernel(int* topk_tmp_id_buf, T* topk_tmp_val_buf, int* id_buf)
{
    int thread_id = threadIdx.x;
    int block_id  = blockIdx.x;

    const bool IS_FP16   = std::is_same<T, half>::value;
    const T    MAX_T_VAL = (IS_FP16) ? HALF_FLT_MAX : FLT_MAX;

    TopK<T, MAX_K> partial;
    if (thread_id == 0) {
        for (int i = 0; i < MAX_K; ++i) {
            partial.p[i] = -1;
            partial.u[i] = -MAX_T_VAL;
        }

        int index = block_id * MAX_K * MAX_K;
        for (int i = 0; i < MAX_K * MAX_K; i++) {
            partial.insert((T)topk_tmp_val_buf[index + i], topk_tmp_id_buf[index + i]);
        }

        index = block_id * MAX_K;
        for (int i = 0; i < MAX_K; i++) {
            id_buf[index + i] = partial.p[i];
        }
    }
}

template<typename T, int MAX_K, int THREADBLOCK_SIZE>
__launch_bounds__(THREADBLOCK_SIZE) __global__
    void batch_topK_kernel_v2(int* topk_tmp_id_buf, T* topk_tmp_val_buf, int* id_buf)
{
    typedef cub::BlockReduce<TopK<T, MAX_K>, THREADBLOCK_SIZE> BlockReduce;
    __shared__ typename BlockReduce::TempStorage               temp_storage;

    int tid = threadIdx.x;
    int bid = blockIdx.x;

    TopK<T, MAX_K> partial;
    const bool     IS_FP16   = std::is_same<T, half>::value;
    const T        MAX_T_VAL = (IS_FP16) ? HALF_FLT_MAX : FLT_MAX;

#pragma unroll
    for (int i = 0; i < MAX_K; ++i) {
        partial.p[i] = -1;
        partial.u[i] = -MAX_T_VAL;
    }

    int ite = MAX_K * MAX_K / THREADBLOCK_SIZE;
#pragma unroll
    for (int i = 0; i < ite; i++) {
        int index = bid * MAX_K * MAX_K + i * THREADBLOCK_SIZE + tid;
        partial.insert((T)topk_tmp_val_buf[index], topk_tmp_id_buf[index]);
    }

    TopK<T, MAX_K> total = BlockReduce(temp_storage).Reduce(partial, reduce_topk_op<T, MAX_K>);

    if (tid == 0) {
#pragma unroll
        for (int i = 0; i < MAX_K; i++) {
            id_buf[bid * MAX_K + i] = total.p[i];
        }
    }
}

template<typename T, int BLOCK_SIZE_, int BLOCKS_PER_BEAM_>
__global__ void topk_stage_1_opt3(const T* __restrict log_probs,
                                  T* tmp_log_probs,
                                  int* topk_tmp_id_buf,
                                  T* topk_tmp_val_buf,
                                  const bool* finished,
                                  const int* sequence_lengths,
                                  const int k,
                                  const int vocab_size,
                                  const float length_penalty,
                                  const int* end_ids)
{
    typedef cub::BlockReduce<TopK_2<T>, BLOCK_SIZE_> BlockReduce;
    __shared__ typename BlockReduce::TempStorage     temp_storage;

    const int tid = threadIdx.x;
    const int bid = blockIdx.x;

    const int row_id             = bid / BLOCKS_PER_BEAM_;  // row id for log_probs (batchbeam index)
    const int block_lane         = bid % BLOCKS_PER_BEAM_;  // block id for a beam
    const int tmp_log_buf_index  = row_id * vocab_size;
    const int tmp_topk_buf_index = row_id * BLOCKS_PER_BEAM_ * k + block_lane * k;

    TopK_2<T>  partial;
    const bool IS_FP16   = std::is_same<T, half>::value;
    const T    MAX_T_VAL = (IS_FP16) ? HALF_FLT_MAX : FLT_MAX;

    if (finished != nullptr && finished[row_id] == true) {
        if (tid < k) {
            const int index = tmp_topk_buf_index + tid;
            if (block_lane == 0 && tid == 0) {
                const int end_id        = end_ids[row_id / k];
                topk_tmp_id_buf[index]  = tmp_log_buf_index + end_id;
                topk_tmp_val_buf[index] = log_probs[tmp_log_buf_index + end_id];
            }
            else {
                topk_tmp_id_buf[index]  = -1;
                topk_tmp_val_buf[index] = -MAX_T_VAL;
            }
        }
        return;
    }

    for (int elem_id = tid + block_lane * BLOCK_SIZE_; elem_id < vocab_size;
         elem_id += BLOCK_SIZE_ * BLOCKS_PER_BEAM_) {
        int index            = elem_id + tmp_log_buf_index;
        tmp_log_probs[index] = log_probs[index];
    }

    for (int ite = 0; ite < k; ite++) {
        partial.init();
#pragma unroll
        for (int elem_id = tid + block_lane * BLOCK_SIZE_; elem_id < vocab_size;
             elem_id += BLOCK_SIZE_ * BLOCKS_PER_BEAM_) {
            int index = elem_id + tmp_log_buf_index;
            partial.insert(tmp_log_probs[index], index);
        }

        TopK_2<T> total = BlockReduce(temp_storage).Reduce(partial, reduce_topk_op_2<T>);

        if (tid == 0) {
            const int index         = tmp_topk_buf_index + ite;
            topk_tmp_id_buf[index]  = total.p;
            topk_tmp_val_buf[index] = total.u;
            tmp_log_probs[total.p]  = -MAX_T_VAL;
        }
        __syncthreads();
    }
}

template<typename T, int BLOCK_SIZE_, int BLOCKS_PER_BEAM_>
__global__ void topk_stage_2_opt3(const int* __restrict topk_tmp_id_buf,
                                  T* topk_tmp_val_buf,
                                  int* ids,
                                  BeamHypotheses beam_hyps,
                                  const int* end_ids,
                                  const int vocab_size,
                                  const int k)
{
    const int size     = k * k * BLOCKS_PER_BEAM_;
    const int tid      = threadIdx.x;
    const int batch_id = blockIdx.x;

    const bool IS_FP16   = std::is_same<T, half>::value;
    const T    MAX_T_VAL = (IS_FP16) ? HALF_FLT_MAX : FLT_MAX;

    typedef cub::BlockReduce<TopK_2<T>, BLOCK_SIZE_> BlockReduce;
    __shared__ typename BlockReduce::TempStorage     temp_storage;
    extern __shared__ char                           array[];

    T*   s_val = topk_tmp_val_buf + batch_id * size;
    int* s_id  = (int*)(array);

    __shared__ int  selected_beams;
    __shared__ bool is_stop;

    if (tid == 0) {
        selected_beams = 0;
        is_stop        = false;
    }
    __syncthreads();
    if (beam_hyps.num_beams != nullptr) {
        const int global_batch_idx = beam_hyps.ite * beam_hyps.local_batch_size + batch_id;
        if (beam_hyps.num_beams[global_batch_idx] == 0 && tid == 0) {
            // initialize the buffer
            beam_hyps.min_normed_scores[global_batch_idx] = FLT_MAX;
        }
        else if (beam_hyps.num_beams[global_batch_idx] == k) {
            return;
        }
    }

    TopK_2<T> partial;

    // In some cases, we may encounter k finished sentences, but scores are bad. So, the max iteration
    // is 2*k here
    for (int ite = 0; ite < 2 * k; ite++) {
        partial.init();
#pragma unroll
        for (int i = tid; i < size; i += BLOCK_SIZE_) {
            partial.insert(s_val[i], i);
        }

        TopK_2<T> total = BlockReduce(temp_storage).Reduce(partial, reduce_topk_op_2<T>);

        if (tid == 0) {
            if (beam_hyps.num_beams != nullptr
                && topk_tmp_id_buf[batch_id * size + total.p] % vocab_size == end_ids[batch_id]) {
                // if beam_token does not belong to top num_beams tokens, it should not be added. Refer from
                // https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/generation_beam_search.py#L257
                if (ite >= k) {
                    s_val[total.p] = -MAX_T_VAL;
                }
                else {
                    const int   global_batch_idx = beam_hyps.ite * beam_hyps.local_batch_size + batch_id;
                    const float normed_score =
                        apply_length_penalty(s_val[total.p], beam_hyps.step, beam_hyps.length_penalty);
                    const int num_beam = beam_hyps.num_beams[global_batch_idx];
                    int       beam_idx = num_beam;
                    // If there are beam_width finished sentences, check that the score of selected candidatet
                    // is higher than min_normed_score or not. If current score is better, replace worst one
                    // and update the min_normed_score.
                    if (num_beam == k) {
                        if (normed_score < beam_hyps.min_normed_scores[global_batch_idx]) {
                            // end the tracing and exist this for loop
                            selected_beams = k;
                            is_stop        = true;
                            break;
                        }
                        else {
                            // find the beam index which's score = min_normed_score, erase it.
                            for (int j = 0; j < k; j++) {
                                if (beam_hyps.normed_scores[global_batch_idx * k + j]
                                    == beam_hyps.min_normed_scores[global_batch_idx]) {
                                    beam_idx = j;
                                    beam_hyps.num_beams[global_batch_idx]--;
                                    beam_hyps.min_normed_scores[global_batch_idx]     = FLT_MAX;
                                    beam_hyps.normed_scores[global_batch_idx * k + j] = normed_score;
                                    for (int l = 0; l < k; l++) {
                                        beam_hyps.min_normed_scores[global_batch_idx] =
                                            min(beam_hyps.min_normed_scores[global_batch_idx],
                                                beam_hyps.normed_scores[global_batch_idx * k + l]);
                                    }
                                    break;
                                }
                            }
                        }
                    }
                    const int tgt_id_offset =
                        ((batch_id + beam_hyps.ite * beam_hyps.local_batch_size) * k + beam_idx)
                        * (beam_hyps.max_seq_len);
                    beam_hyps.output_ids_tgt[tgt_id_offset + beam_hyps.step] = end_ids[batch_id];

                    int prev_id = (topk_tmp_id_buf[batch_id * size + total.p] / vocab_size) % k;
                    for (int j = beam_hyps.step - 1; j >= 0; j--) {
                        const int src_idx = j * beam_hyps.batch_size * k
                                            + beam_hyps.ite * beam_hyps.local_batch_size * k + batch_id * k + prev_id;

                        beam_hyps.output_ids_tgt[tgt_id_offset + j] = beam_hyps.output_ids_src[src_idx];
                        prev_id                                     = beam_hyps.parent_ids_src[src_idx];
                    }
                    const int tgt_beam_idx                       = global_batch_idx * k + beam_idx;
                    beam_hyps.sequence_lengths_tgt[tgt_beam_idx] = beam_hyps.step;
                    beam_hyps.normed_scores[tgt_beam_idx]        = normed_score;
                    beam_hyps.min_normed_scores[global_batch_idx] =
                        min(beam_hyps.min_normed_scores[global_batch_idx], beam_hyps.normed_scores[tgt_beam_idx]);

                    s_val[total.p] = -MAX_T_VAL;
                    beam_hyps.num_beams[global_batch_idx]++;
                }
            }
            else {
                s_id[selected_beams] = total.p;
                s_val[total.p]       = -MAX_T_VAL;
                selected_beams++;
            }
        }
        __syncthreads();
        if (selected_beams >= k) {
            break;
        }
    }
    if (tid < k && is_stop == false) {
        ids[batch_id * k + tid] = topk_tmp_id_buf[batch_id * size + s_id[tid]];
    }
}

template<typename T, int BLOCK_SIZE, int BLOCKS_PER_BEAM>
__global__ void topk_stage_1_opt2_general(const T* __restrict log_probs,
                                          T* tmp_log_probs,
                                          int* topk_tmp_id_buf,
                                          T* topk_tmp_val_buf,
                                          const bool* finished,
                                          const int* sequence_lengths,
                                          const int k,
                                          const int vocab_size,
                                          const float length_penalty)
{
    const bool IS_FP16   = std::is_same<T, half>::value;
    const T    MAX_T_VAL = (IS_FP16) ? HALF_FLT_MAX : FLT_MAX;

    typedef cub::BlockReduce<TopK_2<T>, BLOCK_SIZE> BlockReduce;
    __shared__ typename BlockReduce::TempStorage    temp_storage;

    const int tid = threadIdx.x;
    const int bid = blockIdx.x;

    const int row_id             = bid / BLOCKS_PER_BEAM;  // row id for log_probs
    const int block_lane         = bid % BLOCKS_PER_BEAM;  // block id for a beam
    const int tmp_log_buf_index  = row_id * vocab_size;
    const int tmp_topk_buf_index = row_id * BLOCKS_PER_BEAM * k + block_lane * k;

    TopK_2<T> partial;

    for (int elem_id = tid + block_lane * BLOCK_SIZE; elem_id < vocab_size;
         elem_id += BLOCK_SIZE * BLOCKS_PER_BEAM) {
        int index            = elem_id + tmp_log_buf_index;
        tmp_log_probs[index] = log_probs[index];
    }

    for (int ite = 0; ite < k; ite++) {
        partial.init();
#pragma unroll
        for (int elem_id = tid + block_lane * BLOCK_SIZE; elem_id < vocab_size;
             elem_id += BLOCK_SIZE * BLOCKS_PER_BEAM) {
            int index = elem_id + tmp_log_buf_index;
            partial.insert(tmp_log_probs[index], index);
        }

        TopK_2<T> total = BlockReduce(temp_storage).Reduce(partial, reduce_topk_op_2<T>);

        if (tid == 0) {
            const int index         = tmp_topk_buf_index + ite;
            topk_tmp_id_buf[index]  = total.p;
            topk_tmp_val_buf[index] = total.u;
            tmp_log_probs[total.p]  = -MAX_T_VAL;
        }
        __syncthreads();
    }
}

template<typename T, int BLOCK_SIZE, int BLOCKS_PER_BEAM>
__global__ void topk_stage_2_opt2_general(const int* __restrict topk_tmp_id_buf,
                                          T* topk_tmp_val_buf,
                                          int* ids,
                                          BeamHypotheses beam_hyps,
                                          const int* end_ids,
                                          const int k,
                                          const int vocab_size)
{
    const int size     = k * k * BLOCKS_PER_BEAM;
    const int tid      = threadIdx.x;
    const int batch_id = blockIdx.x;

    const bool IS_FP16   = std::is_same<T, half>::value;
    const T    MAX_T_VAL = (IS_FP16) ? HALF_FLT_MAX : FLT_MAX;

    typedef cub::BlockReduce<TopK_2<T>, BLOCK_SIZE> BlockReduce;
    __shared__ typename BlockReduce::TempStorage    temp_storage;
    extern __shared__ char                          array[];

    T*   s_val = topk_tmp_val_buf + batch_id * size;
    int* s_id  = (int*)(array);

    __shared__ int  selected_beams;
    __shared__ bool is_stop;

    if (tid == 0) {
        selected_beams = 0;
        is_stop        = false;
    }
    __syncthreads();
    if (beam_hyps.num_beams != nullptr) {
        const int global_batch_idx = beam_hyps.ite * beam_hyps.local_batch_size + batch_id;
        if (beam_hyps.num_beams[global_batch_idx] == 0 && tid == 0) {
            beam_hyps.min_normed_scores[global_batch_idx] = FLT_MAX;
        }
        else if (beam_hyps.num_beams[global_batch_idx] == k) {
            return;
        }
    }

    TopK_2<T> partial;

    // In some cases, we may encounter k finished sentences, but scores are bad. So, the max iteration
    // is 2*k here
    for (int ite = 0; ite < 2 * k; ite++) {
        partial.init();
#pragma unroll
        for (int i = tid; i < size; i += BLOCK_SIZE) {
            partial.insert(s_val[i], i);
        }

        TopK_2<T> total = BlockReduce(temp_storage).Reduce(partial, reduce_topk_op_2<T>);

        if (tid == 0) {
            if (beam_hyps.num_beams != nullptr
                && topk_tmp_id_buf[batch_id * size + total.p] % vocab_size == end_ids[batch_id]) {
                // if beam_token does not belong to top num_beams tokens, it should not be added. Refer from
                // https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/generation_beam_search.py#L257
                if (ite >= k) {
                    s_val[total.p] = -MAX_T_VAL;
                }
                else {
                    const int   global_batch_idx = beam_hyps.ite * beam_hyps.local_batch_size + batch_id;
                    const float normed_score =
                        apply_length_penalty(s_val[total.p], beam_hyps.step, beam_hyps.length_penalty);
                    const int num_beam = beam_hyps.num_beams[global_batch_idx];
                    int       beam_idx = num_beam;
                    // If there are beam_width finished sentences, check that the score of selected candidatet
                    // is higher than min_normed_score or not. If current score is better, replace worst one
                    // and update the min_normed_score.
                    if (num_beam == k) {
                        if (normed_score < beam_hyps.min_normed_scores[global_batch_idx]) {
                            // end the tracing and exist this for loop
                            selected_beams = k;
                            is_stop        = true;
                            break;
                        }
                        else {
                            // find the beam index which's score = min_normed_score, erase it.
                            for (int j = 0; j < k; j++) {
                                if (beam_hyps.normed_scores[global_batch_idx * k + j]
                                    == beam_hyps.min_normed_scores[global_batch_idx]) {
                                    beam_idx = j;
                                    beam_hyps.num_beams[global_batch_idx]--;
                                    beam_hyps.min_normed_scores[global_batch_idx]     = FLT_MAX;
                                    beam_hyps.normed_scores[global_batch_idx * k + j] = normed_score;
                                    for (int l = 0; l < k; l++) {
                                        beam_hyps.min_normed_scores[global_batch_idx] =
                                            min(beam_hyps.min_normed_scores[global_batch_idx],
                                                beam_hyps.normed_scores[global_batch_idx * k + l]);
                                    }
                                    break;
                                }
                            }
                        }
                    }
                    const int tgt_id_offset =
                        ((batch_id + beam_hyps.ite * beam_hyps.local_batch_size) * k + beam_idx)
                        * (beam_hyps.max_seq_len);
                    beam_hyps.output_ids_tgt[tgt_id_offset + beam_hyps.step] = end_ids[batch_id];

                    int prev_id = (topk_tmp_id_buf[batch_id * size + total.p] / vocab_size) % k;
                    for (int j = beam_hyps.step - 1; j >= 0; j--) {
                        const int src_idx = j * beam_hyps.batch_size * k
                                            + beam_hyps.ite * beam_hyps.local_batch_size * k + batch_id * k + prev_id;

                        beam_hyps.output_ids_tgt[tgt_id_offset + j] = beam_hyps.output_ids_src[src_idx];
                        prev_id                                     = beam_hyps.parent_ids_src[src_idx];
                    }
                    const int tgt_beam_idx                       = global_batch_idx * k + beam_idx;
                    beam_hyps.sequence_lengths_tgt[tgt_beam_idx] = beam_hyps.step;
                    beam_hyps.normed_scores[tgt_beam_idx]        = normed_score;
                    beam_hyps.min_normed_scores[global_batch_idx] =
                        min(beam_hyps.min_normed_scores[global_batch_idx], beam_hyps.normed_scores[tgt_beam_idx]);

                    s_val[total.p] = -MAX_T_VAL;
                    beam_hyps.num_beams[global_batch_idx]++;
                }
            }
            else {
                s_id[selected_beams] = total.p;
                s_val[total.p]       = -MAX_T_VAL;
                selected_beams++;
            }
        }
        __syncthreads();
        if (selected_beams >= k) {
            break;
        }
    }
    if (tid < k && is_stop == false) {
        ids[batch_id * k + tid] = topk_tmp_id_buf[batch_id * size + s_id[tid]];
    }
}
#define CASE_K_DIV(K, BLOCK_SIZE_1, BLOCK_SIZE_2) \
case K: \
beam_topK_kernel<T, K, BLOCK_SIZE_2><<<batch_size * beam_width, BLOCK_SIZE_2, 0, stream>>>(log_probs, \
topk_tmp_id_buf, \
topk_tmp_val_buf, \
finished, \
sequence_lengths, \
vocab_size, \
diversity_rate, \
length_penalty); \
if (K < 10) \
batch_topK_kernel<T, K, BLOCK_SIZE_1> \
<<<batch_size, BLOCK_SIZE_1, 0, stream>>>(topk_tmp_id_buf, topk_tmp_val_buf, ids); \
else \
batch_topK_kernel_v2<T, K, 32><<<batch_size, 32, 0, stream>>>(topk_tmp_id_buf, topk_tmp_val_buf, ids); \
break;
#define CASE_K(K, BLOCK_SIZE_1_, BLOCK_SIZE_2_, BLOCKS_PER_BEAM_) \
case K: \
topk_stage_1_opt3<float, BLOCK_SIZE_1_, BLOCKS_PER_BEAM_> \
<<<batch_size * K * BLOCKS_PER_BEAM_, BLOCK_SIZE_1_, 0, stream>>>(log_probs, \
temp_log_probs, \
topk_tmp_id_buf, \
topk_tmp_val_buf, \
finished, \
sequence_lengths, \
beam_width, \
vocab_size, \
length_penalty, \
end_ids); \
topk_stage_2_opt3<float, BLOCK_SIZE_2_, BLOCKS_PER_BEAM_> \
<<<batch_size, BLOCK_SIZE_2_, K * sizeof(int), stream>>>( \
topk_tmp_id_buf, topk_tmp_val_buf, ids, *beam_hyps, end_ids, vocab_size, beam_width); \
sync_check_cuda_error(); \
break;
template<typename T>
void invokeTopkBeamSearch(void* workspace,
                          size_t& workspace_size,
                          T* log_probs,
                          int* ids,
                          BeamHypotheses* beam_hyps,
                          const bool* finished,
                          const int* sequence_lengths,
                          const int batch_size,
                          const int beam_width,
                          const int vocab_size_padded_,
                          const T diversity_rate,
                          const float length_penalty,
                          const int* end_ids,
                          cudaStream_t stream)
{
    TM_LOG_DEBUG("%s", __PRETTY_FUNCTION__);
    // log_probs: (batch, beam, vocab) cumulative log_probs of beams ending with a token.
    const int vocab_size = vocab_size_padded_;
    // Beam size should be less than or equal to vocab size.
    assert(beam_width <= vocab_size);
    // Beam search needs the sequence lengths of beams to apply length penalty.
    assert(length_penalty == 0.0f || sequence_lengths != nullptr);
    const int max_block_per_beam      = 8;
    int       temp_log_probs_buf_size = batch_size * beam_width * vocab_size;                     // type float
    int       topk_tmp_ids_buf_size = batch_size * beam_width * beam_width * max_block_per_beam;  // type int
    int       topk_tmp_val_buf_size = batch_size * beam_width * beam_width * max_block_per_beam;  // type float

    // prevent memory misaligned address
    temp_log_probs_buf_size = (int)(ceil(temp_log_probs_buf_size / 4.)) * 4;
    topk_tmp_ids_buf_size   = (int)(ceil(topk_tmp_ids_buf_size / 4.)) * 4;
    topk_tmp_val_buf_size   = (int)(ceil(topk_tmp_val_buf_size / 4.)) * 4;

    if (workspace == nullptr) {
        workspace_size = sizeof(float) * temp_log_probs_buf_size + sizeof(int) * topk_tmp_ids_buf_size
                         + sizeof(float) * topk_tmp_val_buf_size;
        return;
    }
    else {
        T*   temp_log_probs   = (T*)workspace;
        int* topk_tmp_id_buf  = (int*)(temp_log_probs + temp_log_probs_buf_size);
        T*   topk_tmp_val_buf = (T*)(topk_tmp_id_buf + topk_tmp_ids_buf_size);
        if (diversity_rate == 0.0f) {
            switch (beam_width) {
                CASE_K(1, 128, 128, 8);
                CASE_K(4, 128, 128, 8);
                CASE_K(10, 128, 128, 8);
                CASE_K(16, 128, 128, 5);
                CASE_K(32, 256, 128, 1);
                CASE_K(64, 256, 256, 1);
                default:
                    topk_stage_1_opt2_general<T, 128, 1>
                        <<<batch_size * beam_width * 1, 128, 0, stream>>>(log_probs,
                                                                          temp_log_probs,
                                                                          topk_tmp_id_buf,
                                                                          topk_tmp_val_buf,
                                                                          finished,
                                                                          sequence_lengths,
                                                                          beam_width,
                                                                          vocab_size,
                                                                          length_penalty);
                    topk_stage_2_opt2_general<T, 128, 1>
                        <<<batch_size,
                           128,
                           beam_width * beam_width * 1 * sizeof(float) + beam_width * sizeof(int),
                           stream>>>(
                            topk_tmp_id_buf, topk_tmp_val_buf, ids, *beam_hyps, end_ids, beam_width, vocab_size);
                    break;
            }
        }
        else {
            switch (beam_width) {
                CASE_K_DIV(1, 256, 256);
                CASE_K_DIV(4, 256, 256);
                CASE_K_DIV(16, 256, 64);
                CASE_K_DIV(32, 256, 64);
                CASE_K_DIV(64, 256, 64);
                default:
                    FT_CHECK_WITH_INFO(false, fmtstr("Topk kernel does not support beamwidth = %d \n", beam_width));
                    break;
            }
        }
        return;
    }
}
#undef CASE_K
#undef CASE_K_DIV

template void invokeTopkBeamSearch(void* workspace, size_t& workspace_size, float* log_probs, int* ids,
                                   BeamHypotheses* beam_hyps, const bool* finished, const int* sequence_lengths,
                                   const int batch_size, const int beam_width, const int vocab_size_padded_,
                                   const float diversity_rate, const float length_penalty, const int* end_ids,
                                   cudaStream_t stream);

template<typename T>
__global__ void tileEncoderResults(T* tiled_output,
                                   int* tiled_sequence_length,
                                   const T* output,
                                   const int* sequence_length,
                                   const uint batch_size,
                                   const uint beam_width,
                                   const uint d_model)
{
    if (blockIdx.x == 0) {
        for (uint i = threadIdx.x; i < batch_size * beam_width; i += blockDim.x) {
            tiled_sequence_length[i] = sequence_length[i / beam_width];
        }
    }

    int tgt_offset =
        blockIdx.x * gridDim.y * gridDim.z * d_model + blockIdx.y * gridDim.z * d_model + blockIdx.z * d_model;
    int src_offset = blockIdx.x * gridDim.z * d_model + blockIdx.z * d_model;
    for (uint i = threadIdx.x; i < d_model; i += blockDim.x) {
        tiled_output[i + tgt_offset] = output[i + src_offset];
    }
}

template<typename T>
void invokeTileEncoderResults(T* tiled_output,
                              int* tiled_sequence_length,
                              const T* output,
                              const int* sequence_length,
                              const size_t batch_size,
                              const size_t beam_width,
                              const size_t mem_max_seq_len,
                              const size_t d_model,
                              cudaStream_t stream)
{
    // tiled_output: [batch_size, beam_width, mem_max_seq_len, d_model]
    // tiled_sequence_length: [batch_size, beam_width]
    // output: [batch_size, mem_max_seq_len, d_model]
    // sequence_length [batch_size]
    dim3 grid(batch_size, beam_width, mem_max_seq_len);
    bool is_half2 = (std::is_same<T, half>::value) && (d_model % 2 == 0);

    if (is_half2) {
        using T2 = typename TypeConverter<T>::Type;  // fp16 to half2, bf16 to bf162
        dim3 block(min(512, (int)(d_model / 2)));
        tileEncoderResults<T2><<<grid, block, 0, stream>>>((T2*)tiled_output,
                                                           tiled_sequence_length,
                                                           (const T2*)output,
                                                           sequence_length,
                                                           batch_size,
                                                           beam_width,
                                                           d_model / 2);
    }
    else {
        dim3 block(min(512, (int)d_model));
        tileEncoderResults<T><<<grid, block, 0, stream>>>(
            tiled_output, tiled_sequence_length, output, sequence_length, batch_size, beam_width, d_model);
    }
}

template void invokeTileEncoderResults(float* tiled_output, int* tiled_sequence_length, const float* output,
                                       const int* sequence_length, const size_t batch_size, const size_t beam_width,
                                       const size_t mem_max_seq_len, const size_t d_model, cudaStream_t stream);

template void invokeTileEncoderResults(half* tiled_output, int* tiled_sequence_length, const half* output,
                                       const int* sequence_length, const size_t batch_size, const size_t beam_width,
                                       const size_t mem_max_seq_len, const size_t d_model, cudaStream_t stream);

template void invokeTileEncoderResults(half2* tiled_output, int* tiled_sequence_length, const half2* output,
                                       const int* sequence_length, const size_t batch_size, const size_t beam_width,
                                       const size_t mem_max_seq_len, const size_t d_model, cudaStream_t stream);

#ifdef ENABLE_BF16
template void invokeTileEncoderResults(__nv_bfloat16* tiled_output, int* tiled_sequence_length,
                                       const __nv_bfloat16* output, const int* sequence_length,
                                       const size_t batch_size, const size_t beam_width,
                                       const size_t mem_max_seq_len, const size_t d_model, cudaStream_t stream);
#endif

__global__ void insertUnfinishedPath(BeamHypotheses beam_hyps,
                                     const bool* finished,
                                     const float* cum_log_probs,
                                     const int batch_size,
                                     const int beam_width)
{
    const int bid           = blockIdx.x;
    const int tgt_start_idx = beam_hyps.num_beams[bid];
    if (beam_hyps.is_done[bid]) {
        return;
    }
    for (int i = 0; i < beam_width; i++) {
        if (threadIdx.x == 0) {
            const int src_beam_idx = bid * beam_width + i;
            const int tgt_beam_idx = bid * beam_width * 2 + i + tgt_start_idx;

            const int length = beam_hyps.sequence_lengths_src[src_beam_idx];

            beam_hyps.output_ids_tgt[(tgt_beam_idx) * (beam_hyps.max_seq_len + 1) + length] =
                beam_hyps.output_ids_src[length * batch_size * beam_width + src_beam_idx];
            if (beam_hyps.log_probs != nullptr && beam_hyps.log_probs_src != nullptr) {
                beam_hyps.log_probs[(tgt_beam_idx) * (beam_hyps.max_seq_len + 1) + length] =
                    beam_hyps.log_probs_src[length * batch_size * beam_width + src_beam_idx];
            }
            int prev_id = beam_hyps.parent_ids_src[length * batch_size * beam_width + src_beam_idx];
            for (int j = length - 1; j >= 0; j--) {
                // output_ids_tgt need to use max_seq_len + 1 because its shape is
                // [bs, beam_width, max_seq_len + 1]
                beam_hyps.output_ids_tgt[(tgt_beam_idx) * (beam_hyps.max_seq_len + 1) + j] =
                    beam_hyps.output_ids_src[j * batch_size * beam_width + bid * beam_width + prev_id];
                if (beam_hyps.log_probs != nullptr && beam_hyps.log_probs_src != nullptr) {
                    beam_hyps.log_probs[(tgt_beam_idx) * (beam_hyps.max_seq_len + 1) + j] =
                        beam_hyps.log_probs_src[j * batch_size * beam_width + bid * beam_width + prev_id];
                }
                prev_id = beam_hyps.parent_ids_src[j * batch_size * beam_width + bid * beam_width + prev_id];
            }
            beam_hyps.sequence_lengths_tgt[tgt_beam_idx] = length;

            beam_hyps.normed_scores[tgt_beam_idx] = apply_length_penalty(
                cum_log_probs[src_beam_idx], finished[src_beam_idx] ? length + 1 : length, beam_hyps.length_penalty);
            beam_hyps.cum_log_probs[tgt_beam_idx] = cum_log_probs[src_beam_idx];

            beam_hyps.num_beams[bid]++;
        }
    }
}

void invokeInsertUnfinishedPath(BeamHypotheses beam_hyps,
                                const bool* finished,
                                const float* cum_log_probs,
                                const int batch_size,
                                const int beam_width,
                                cudaStream_t stream)
{
    insertUnfinishedPath<<<batch_size, 256, 0, stream>>>(beam_hyps, finished, cum_log_probs, batch_size, beam_width);
}

}  // namespace turbomind
src/turbomind/kernels/beam_search_topk_kernels.h
deleted 100644 → 0   View file @ 83697422
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cuda_runtime.h>
#pragma once
namespace turbomind {

// In the original beam search implementation, if a beam is finished, we set it as finished
// and only continue to do beam search on the remaining beams (namely, beam_width - 1 beams in the next step).
//
// In this implementation, when a beam is finished, we trace the path and record it in output_ids_tgt,
// and also record the normalized scores. The beam search continues to use `beam_width` beams in the
// next step.
//
// After we collect `beam_width` beams, we will sort them by their norm_scores.
struct BeamHypotheses {
    int*   output_ids_tgt       = nullptr;
    int*   sequence_lengths_tgt = nullptr;
    float* cum_log_probs        = nullptr;  // cum_log
    float* normed_scores        = nullptr;  // cum_log / (length**length_penalty)
    float* log_probs            = nullptr;  // log probs of each generated token
    float* min_normed_scores    = nullptr;  // record the min normed scores for each batch
    int*   num_beams            = nullptr;  // the number of finished beams we collect
    bool*  is_done              = nullptr;

    // Used to set inputs
    const int*   output_ids_src;
    const int*   parent_ids_src;
    const int*   sequence_lengths_src;
    const int*   end_ids;
    const float* log_probs_src;

    // some variables for kernels
    int   step;
    int   ite;
    int   batch_size;
    int   local_batch_size;
    int   max_seq_len;
    float length_penalty;

    bool early_stopping         = true;
    bool is_return_normed_score = true;  // return normed_cum_log_probs or cum_log_probs
};

template<typename T>
void invokeTopkBeamSearch(void*           workspace,
                          size_t&         workspace_size,
                          T*              log_probs,
                          int*            ids,
                          BeamHypotheses* beam_hyps,
                          const bool*     finished,
                          const int*      sequence_lengths,
                          const int       batch_size,
                          const int       beam_width,
                          const int       vocab_size_padded_,
                          const T         diversity_rate,
                          const float     length_penalty,
                          const int*      end_ids,
                          cudaStream_t    stream);

template<typename T>
void invokeTileEncoderResults(T*           tiled_encoder_output,
                              int*         tiled_encoder_sequence_length,
                              const T*     encoder_output,
                              const int*   encoder_sequence_length,
                              const size_t batch_size,
                              const size_t beam_width,
                              const size_t mem_max_seq_len,
                              const size_t d_model,
                              cudaStream_t stream);

void invokeInsertUnfinishedPath(BeamHypotheses beam_hyps,
                                const bool*    finished,
                                const float*   cum_log_probs,
                                const int      batch_size,
                                const int      beam_width,
                                cudaStream_t   stream);

}  // namespace turbomind
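A note on the scoring recorded in `normed_scores`: it is the length-normalized value cum_log_prob / length^length_penalty that the kernels compute with `apply_length_penalty`. The following host-side sketch is not part of the library; it is only a reference under that assumption, showing how two finished hypotheses of different lengths compare once normalized:

#include <cmath>
#include <cstdio>

// Reference of the normalization used for normed_scores:
// score = cum_log_prob / pow(length, length_penalty).
static float normed_score(float cum_log_prob, int length, float length_penalty)
{
    if (length_penalty == 0.0f || length == 1) {
        return cum_log_prob;
    }
    return cum_log_prob / std::pow((float)length, length_penalty);
}

int main()
{
    // A longer hypothesis with a lower raw log-prob can still rank higher after normalization.
    std::printf("short: %f\n", normed_score(-6.0f, 4, 1.0f));  // -1.500
    std::printf("long : %f\n", normed_score(-9.0f, 8, 1.0f));  // -1.125
    return 0;
}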
src/turbomind/kernels/bert_preprocess_kernels.cu
View file @ 981a4610
...
@@ -68,120 +68,6 @@ void invokeGetPaddingOffsetAndCuSeqLens(size_t* h_pinned_token_num,
    sync_check_cuda_error();
}
template<typename T>
__global__ void buildEncoderAttentionMaskKernel(T* attention_mask, const int* sequence_lengths, const int max_seq_len)
{
    // sequence_lengths: [batch_size]
    // attention_mask:   [batch_size, 1, max_seq_len, max_seq_len]
    attention_mask += blockIdx.x * max_seq_len * max_seq_len;
    const int length = sequence_lengths[blockIdx.x];
    for (int i = threadIdx.x; i < max_seq_len * max_seq_len; i += blockDim.x) {
        // int row_id = i / max_seq_len;
        int col_id = i % max_seq_len;
        // if (row_id < length && col_id < length) {
        // TODO (bhsueh) check whether this modification is ok on other models
        if (col_id < length) {
            attention_mask[i] = (T)(1.0f);
        }
        else {
            attention_mask[i] = (T)(0.0f);
        }
    }
}

template<typename T>
void invokeBuildEncoderAttentionMask(
    T* attention_mask, const int* sequence_lengths, const int batch_size, const int max_seq_len, cudaStream_t stream)
{
    buildEncoderAttentionMaskKernel<<<batch_size, 256, 0, stream>>>(attention_mask, sequence_lengths, max_seq_len);
}

template void invokeBuildEncoderAttentionMask(
    float* attention_mask, const int* sequence_lengths, const int batch_size, const int max_seq_len, cudaStream_t stream);
template void invokeBuildEncoderAttentionMask(
    half* attention_mask, const int* sequence_lengths, const int batch_size, const int max_seq_len, cudaStream_t stream);
#ifdef ENABLE_FP8
template void invokeBuildEncoderAttentionMask(
    __nv_fp8_e4m3* attention_mask, const int* sequence_lengths, const int batch_size, const int max_seq_len, cudaStream_t stream);
#endif  // ENABLE_FP8
#ifdef ENABLE_BF16
template void invokeBuildEncoderAttentionMask(
    __nv_bfloat16* attention_mask, const int* sequence_lengths, const int batch_size, const int max_seq_len, cudaStream_t stream);
#endif
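For reference, the mask built above keys only on the column index: in every [max_seq_len, max_seq_len] tile, the value is 1 for columns below the sequence length and 0 elsewhere, independent of the row. A minimal host-side C++ sketch of the same layout (illustrative only, not part of the kernels):

#include <vector>

// Builds the same [batch_size, 1, max_seq_len, max_seq_len] mask on the host.
std::vector<float> build_encoder_attention_mask(const std::vector<int>& sequence_lengths, int max_seq_len)
{
    const int          batch_size = (int)sequence_lengths.size();
    std::vector<float> mask(batch_size * max_seq_len * max_seq_len, 0.0f);
    for (int b = 0; b < batch_size; ++b) {
        for (int i = 0; i < max_seq_len * max_seq_len; ++i) {
            const int col_id = i % max_seq_len;  // same rule the kernel applies per thread
            if (col_id < sequence_lengths[b]) {
                mask[b * max_seq_len * max_seq_len + i] = 1.0f;
            }
        }
    }
    return mask;
}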
__global__ void getTrtPaddingOffsetKernel(int* trt_mha_padding_offset, const int* sequence_length, const int batch_size)
{
    // use for get tensorrt fused mha padding offset
    // when we remove the padding
    extern __shared__ int tmp_offset[];
    if (threadIdx.x == 0) {
        tmp_offset[0] = 0;
        for (int i = 0; i < batch_size; i++) {
            tmp_offset[i + 1] = tmp_offset[i] + sequence_length[i];
        }
    }
    __syncthreads();

    for (int i = threadIdx.x; i < batch_size + 1; i += blockDim.x) {
        trt_mha_padding_offset[i] = tmp_offset[i];
    }
}

void invokeGetTrtPaddingOffset(int*         trt_mha_padding_offset,
                               const int*   sequence_length,
                               const int    batch_size,
                               cudaStream_t stream)
{
    getTrtPaddingOffsetKernel<<<1, 256, sizeof(int) * (batch_size + 1), stream>>>(
        trt_mha_padding_offset, sequence_length, batch_size);
}

__global__ void getTrtPaddingOffsetKernel(int*       trt_mha_padding_offset,
                                          const int* sequence_length,
                                          const int  request_batch_size,
                                          const int  request_seq_len)
{
    // use for get tensorrt fused mha padding offset
    // when we keep the padding
    extern __shared__ int tmp_offset[];
    if (threadIdx.x == 0) {
        tmp_offset[0] = 0;
        for (int i = 0; i < request_batch_size; i++) {
            tmp_offset[i * 2 + 1] = tmp_offset[i * 2] + sequence_length[i];
            tmp_offset[i * 2 + 2] = request_seq_len * (i + 1);
        }
    }
    __syncthreads();

    for (int i = threadIdx.x; i < 2 * request_batch_size + 1; i += blockDim.x) {
        trt_mha_padding_offset[i] = tmp_offset[i];
    }
}

void invokeGetTrtPaddingOffset(int*         trt_mha_padding_offset,
                               const int*   sequence_length,
                               const int    request_batch_size,
                               const int    request_seq_len,
                               cudaStream_t stream)
{
    getTrtPaddingOffsetKernel<<<1, 256, sizeof(int) * (2 * request_batch_size + 1), stream>>>(
        trt_mha_padding_offset, sequence_length, request_batch_size, request_seq_len);
}
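Both kernels above build an exclusive prefix sum over the per-sequence lengths; the second variant interleaves the padded end of each request so TensorRT's fused MHA receives 2 * batch_size + 1 offsets instead of batch_size + 1. A host-side sketch of the two layouts (reference only, not part of the kernels):

#include <vector>

// Padding removed: offsets are [0, l0, l0+l1, ...], batch_size + 1 entries.
std::vector<int> trt_offsets_no_padding(const std::vector<int>& seq_len)
{
    std::vector<int> off(seq_len.size() + 1, 0);
    for (size_t i = 0; i < seq_len.size(); ++i) {
        off[i + 1] = off[i] + seq_len[i];
    }
    return off;
}

// Padding kept: [0, l0, S, S+l1, 2S, ...], 2 * batch_size + 1 entries (S = request_seq_len).
// Example: seq_len = {2, 3}, S = 4 gives {0, 2, 4, 7, 8}.
std::vector<int> trt_offsets_keep_padding(const std::vector<int>& seq_len, int request_seq_len)
{
    std::vector<int> off(2 * seq_len.size() + 1, 0);
    for (size_t i = 0; i < seq_len.size(); ++i) {
        off[2 * i + 1] = off[2 * i] + seq_len[i];
        off[2 * i + 2] = request_seq_len * (int)(i + 1);
    }
    return off;
}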
template<typename T>
__global__ void rebuild_sequence_length_padding(const T* src, T* dst, const int* padding_offset, const int n)
{
...
@@ -287,183 +173,4 @@ template void invokeRemovePadding(__nv_bfloat16* dst,
                                  cudaStream_t stream);
#endif
template<typename T>
__global__ void buildRelativeAttentionBias(T*         relative_attention_bias,
                                           const T*   relative_attention_bias_table,
                                           const int  head_num,
                                           const int  seq_len,
                                           const int  num_bucket,
                                           const bool is_bidirectional,
                                           const int  max_distance)
{
    const int head_id = blockIdx.x;
    for (int seq_id = threadIdx.x; seq_id < seq_len * seq_len; seq_id += blockDim.x) {
        int row_id = seq_id / seq_len;
        int col_id = seq_id % seq_len;

        int relative_position = col_id - row_id;

        int relative_buckets = 0;
        int tmp_num_bucket   = num_bucket;
        if (is_bidirectional) {
            tmp_num_bucket /= 2;
            if (relative_position > 0) {
                relative_buckets += tmp_num_bucket;
            }
            else {
                relative_position *= -1;
            }
        }
        else {
            relative_position = abs(relative_position);
        }

        int  max_exact = tmp_num_bucket / 2;
        bool is_small  = relative_position < max_exact;

        int relative_position_if_large =
            max_exact
            + (int)(logf(relative_position * 1.0f / max_exact) / logf((float)max_distance / max_exact)
                    * (tmp_num_bucket - max_exact));

        relative_position_if_large = min(relative_position_if_large, tmp_num_bucket - 1);

        relative_buckets += is_small ? relative_position : relative_position_if_large;

        relative_attention_bias[head_id * seq_len * seq_len + seq_id] =
            relative_attention_bias_table[head_id * num_bucket + relative_buckets];
    }
}

template<typename T>
void invokeBuildRelativeAttentionBias(T*                          relative_attention_bias,
                                      const T*                    relative_attention_bias_table,
                                      const int                   head_num,
                                      const int                   seq_len,
                                      const int                   num_bucket,
                                      const bool                  is_bidirectional,
                                      const int                   max_distance,
                                      const PositionEmbeddingType position_embedding_type,
                                      cudaStream_t                stream)
{
    if (position_embedding_type == PositionEmbeddingType::absolute) {
        return;
    }
    dim3 grid(head_num);
    dim3 block(256);
    buildRelativeAttentionBias<<<grid, block, 0, stream>>>(relative_attention_bias,
                                                           relative_attention_bias_table,
                                                           head_num,
                                                           seq_len,
                                                           num_bucket,
                                                           is_bidirectional,
                                                           max_distance);
}

template void invokeBuildRelativeAttentionBias(float* relative_attention_bias,
                                               const float* relative_attention_bias_table,
                                               const int head_num, const int seq_len, const int num_bucket,
                                               const bool is_bidirectional, const int max_distance,
                                               const PositionEmbeddingType position_embedding_type,
                                               cudaStream_t stream);

template void invokeBuildRelativeAttentionBias(half* relative_attention_bias,
                                               const half* relative_attention_bias_table,
                                               const int head_num, const int seq_len, const int num_bucket,
                                               const bool is_bidirectional, const int max_distance,
                                               const PositionEmbeddingType position_embedding_type,
                                               cudaStream_t stream);

#ifdef ENABLE_BF16
template void invokeBuildRelativeAttentionBias(__nv_bfloat16* relative_attention_bias,
                                               const __nv_bfloat16* relative_attention_bias_table,
                                               const int head_num, const int seq_len, const int num_bucket,
                                               const bool is_bidirectional, const int max_distance,
                                               const PositionEmbeddingType position_embedding_type,
                                               cudaStream_t stream);
#endif
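The bucketing above follows the T5-style scheme: after the bidirectional split, half of the buckets cover small exact distances and the other half cover larger distances on a log scale capped by max_distance. A host-side reference of the same per-element index computation (illustrative only, mirroring the kernel above):

#include <algorithm>
#include <cmath>
#include <cstdlib>

// Mirrors the bucket computation in buildRelativeAttentionBias.
int relative_position_bucket(int relative_position, bool is_bidirectional, int num_bucket, int max_distance)
{
    int relative_buckets = 0;
    int tmp_num_bucket   = num_bucket;
    if (is_bidirectional) {
        tmp_num_bucket /= 2;
        if (relative_position > 0) {
            relative_buckets += tmp_num_bucket;
        }
        else {
            relative_position *= -1;
        }
    }
    else {
        relative_position = std::abs(relative_position);
    }
    const int  max_exact = tmp_num_bucket / 2;
    const bool is_small  = relative_position < max_exact;
    int        relative_position_if_large =
        max_exact
        + (int)(std::log(relative_position * 1.0f / max_exact) / std::log((float)max_distance / max_exact)
                * (tmp_num_bucket - max_exact));
    relative_position_if_large = std::min(relative_position_if_large, tmp_num_bucket - 1);
    return relative_buckets + (is_small ? relative_position : relative_position_if_large);
}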
#ifdef ENABLE_FP8
template<typename T_OUT, typename T_IN>
__global__ void getLastTokenDequantize(getLastTokenDequantizeParam<T_OUT, T_IN> param)
{
    param.output[blockIdx.x * param.d_model + threadIdx.x] =
        (T_OUT)((float)param.input[blockIdx.x * param.max_seq_len * param.d_model + threadIdx.x]
                * __ldg(param.input_scale));
}

template<typename T_OUT, typename T_IN>
void invokeGetLastTokenDequantize(getLastTokenDequantizeParam<T_OUT, T_IN> param)
{
    FT_CHECK(param.d_model <= 1024);
    getLastTokenDequantize<T_OUT, T_IN><<<param.batch_size, param.d_model, 0, param.stream>>>(param);
}

template void invokeGetLastTokenDequantize<__nv_bfloat16, __nv_fp8_e4m3>(
    getLastTokenDequantizeParam<__nv_bfloat16, __nv_fp8_e4m3> param);

template<typename T_OUT, typename T_IN, QUANTIZE_MODE quantize_mode>
__global__ void quantizeMatrixRebuildPadding(QuantizeMatrixRebuildPaddingParam<T_OUT, T_IN, quantize_mode> param)
{
    for (int i = threadIdx.x; i < param.d_model; i += blockDim.x) {
        int padded_row_id = blockIdx.x + (param.padding_offset == nullptr ? 0 : param.padding_offset[blockIdx.x]);
        if (quantize_mode == QUANTIZE_MODE::PER_TENSOR) {
            param.dst[padded_row_id * param.d_model + i] =
                (T_OUT)((float)param.src[blockIdx.x * param.d_model + i] * __ldg(param.scale));
        }
        else if (quantize_mode == QUANTIZE_MODE::PER_CHANNEL) {
            param.dst[padded_row_id * param.d_model + i] =
                (T_OUT)((float)param.src[blockIdx.x * param.d_model + i] * __ldg(param.scale + i));
        }
    }
}

template<>
__global__ void quantizeMatrixRebuildPadding(
    QuantizeMatrixRebuildPaddingParam<half, __nv_fp8_e4m3, QUANTIZE_MODE::PER_TENSOR> param)
{
    int padded_row_id = blockIdx.x + (param.padding_offset == nullptr ? 0 : __ldg(&param.padding_offset[blockIdx.x]));
    __nv_fp8x4_e4m3* src_ptr = ((__nv_fp8x4_e4m3*)param.src) + blockIdx.x * (param.d_model / 4);
    half2*           dst_ptr = ((half2*)param.dst) + padded_row_id * (param.d_model / 2);
    half2            scale   = cuda_cast<half2>(__ldg(param.scale));
    for (int i = threadIdx.x; i < param.d_model / 4; i += blockDim.x) {
        half2 val_0;
        half2 val_1;
        fp8x4_e4m3_to_half2(&val_0, &val_1, src_ptr + i);
        val_0 = hmul2(val_0, scale);
        val_1 = hmul2(val_1, scale);

        dst_ptr[2 * i + 0] = val_0;
        dst_ptr[2 * i + 1] = val_1;
    }
}

template<typename T_OUT, typename T_IN, QUANTIZE_MODE quantize_mode>
void invokeQuantizeMatrixRebuildPadding(QuantizeMatrixRebuildPaddingParam<T_OUT, T_IN, quantize_mode> param)
{
    dim3 grid(param.token_num);
    dim3 block(param.d_model);
    FT_CHECK(block.x <= 1024);
    if (block.x % 4 == 0) {
        block.x /= 4;
    }
    quantizeMatrixRebuildPadding<<<grid, block, 0, param.stream>>>(param);
}

template void invokeQuantizeMatrixRebuildPadding<half, __nv_fp8_e4m3, QUANTIZE_MODE::PER_TENSOR>(
    QuantizeMatrixRebuildPaddingParam<half, __nv_fp8_e4m3, QUANTIZE_MODE::PER_TENSOR> param);
#endif

}  // namespace turbomind
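The PER_TENSOR / PER_CHANNEL branches above differ only in which scale they read: a single scalar for the whole matrix versus one scale per model dimension. A small host-side sketch of that distinction (reference only; the real kernel additionally scatters each row back to its padded position):

#include <vector>

enum class QuantMode { PER_TENSOR, PER_CHANNEL };

// Rescales one row of d_model values with either a single scale or per-channel scales.
void rescale_row(const std::vector<float>& src, std::vector<float>& dst,
                 const std::vector<float>& scale, QuantMode mode)
{
    for (size_t i = 0; i < src.size(); ++i) {
        const float s = (mode == QuantMode::PER_TENSOR) ? scale[0] : scale[i];
        dst[i] = src[i] * s;
    }
}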
src/turbomind/kernels/bert_preprocess_kernels.h
View file @ 981a4610
...
@@ -15,7 +15,6 @@
*/
#pragma once
#include "src/turbomind/kernels/gen_relative_pos_bias.h"
#include "src/turbomind/utils/cuda_utils.h"
#include <cuda_fp16.h>
#include <cuda_runtime.h>
...
@@ -46,21 +45,6 @@ inline void invokeGetPaddingOffset(size_t* h_pinned_token_num,
        h_pinned_token_num, h_token_num, tmp_mask_offset, nullptr, sequence_length, batch_size, max_seq_len, stream);
}

template<typename T>
void invokeBuildEncoderAttentionMask(
    T* attention_mask, const int* sequence_lengths, const int batch_size, const int max_seq_len, cudaStream_t stream);

void invokeGetTrtPaddingOffset(int*         trt_mha_padding_offset,
                               const int*   sequence_length,
                               const int    request_batch_size,
                               cudaStream_t stream);

void invokeGetTrtPaddingOffset(int*         trt_mha_padding_offset,
                               const int*   sequence_length,
                               const int    request_batch_size,
                               const int    request_seq_len,
                               cudaStream_t stream);

template<typename T>
void invokeRebuildPadding(
    T* dst, const T* src, const int* padding_offset, const int token_num, const int hidden_dim, cudaStream_t stream);
...
@@ -69,46 +53,4 @@ template<typename T>
void invokeRemovePadding(
    T* dst, const T* src, const int* padding_offset, const int token_num, const int hidden_dim, cudaStream_t stream);

template<typename T>
void invokeBuildRelativeAttentionBias(T*                          relative_attention_bias,
                                      const T*                    relative_attention_bias_table,
                                      const int                   head_num,
                                      const int                   seq_len,
                                      const int                   num_bucket,
                                      const bool                  is_bidirectional,
                                      const int                   max_distance,
                                      const PositionEmbeddingType position_embedding_type,
                                      cudaStream_t                stream);

template<typename T_OUT, typename T_IN>
struct getLastTokenDequantizeParam {
    T_OUT* const       output;
    T_IN const* const  input;
    float const* const input_scale;
    const int          batch_size;
    const int          max_seq_len;
    const int          d_model;
    cudaStream_t       stream;
};

template<typename T_OUT, typename T_IN>
void invokeGetLastTokenDequantize(getLastTokenDequantizeParam<T_OUT, T_IN> param);

#ifdef ENABLE_FP8
template<typename T_OUT, typename T_IN, QUANTIZE_MODE quantize_mode>
struct QuantizeMatrixRebuildPaddingParam {
    T_OUT*       dst;
    const T_IN*  src;
    const int*   padding_offset;
    const int    token_num;
    const int    d_model;
    const float* scale;
    cudaStream_t stream;
};

template<typename T_OUT, typename T_IN, QUANTIZE_MODE quantize_mode>
void invokeQuantizeMatrixRebuildPadding(QuantizeMatrixRebuildPaddingParam<T_OUT, T_IN, quantize_mode> param);
#endif  // ENABLE_FP8

}  // namespace turbomind
src/turbomind/kernels/decoder_masked_multihead_attention.h
View file @ 981a4610
...
@@ -16,7 +16,6 @@
#pragma once
#include "src/turbomind/layers/attention_layers_fp8/AttentionFP8Weight.h"
#include "src/turbomind/utils/cuda_bf16_wrapper.h"
#include "src/turbomind/utils/cuda_fp8_utils.h"
#include <cuda_fp16.h>
...
src/turbomind/kernels/decoding_kernels.cu
View file @ 981a4610
...
@@ -21,81 +21,6 @@
namespace turbomind {

// static const float HALF_FLT_MAX = 65504.F;

template<typename T>
__global__ void decodingInitialize(bool*      finished,
                                   int*       sequence_length,
                                   int*       word_ids,
                                   T*         cum_log_probs,
                                   const int* sentence_ids,
                                   const int  batch_size,
                                   const int  beam_width,
                                   const int  max_input_length)
{
    const bool IS_FP16   = std::is_same<T, half>::value;
    const T    MAX_T_VAL = (IS_FP16) ? (T)HALF_FLT_MAX : (T)1e20f;  // BF16 and FP32 have the same dynamic range
    for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < batch_size * beam_width;
         index += blockDim.x * gridDim.x) {
        finished[index]        = false;
        sequence_length[index] = max_input_length;
        if (word_ids != nullptr) {
            word_ids[index] = sentence_ids[index / beam_width];
        }
        cum_log_probs[index] = (index % beam_width == 0) ? (T)0.0f : (T)-MAX_T_VAL;
    }
}

template<typename T>
void invokeDecodingInitialize(bool*        finished,
                              int*         sequence_length,
                              int*         word_ids,
                              T*           cum_log_probs,
                              const int*   sentence_ids,
                              const int    batch_size,
                              const int    beam_width,
                              const int    max_input_length,
                              cudaStream_t stream)
{
    dim3 grid((int)ceil(batch_size * beam_width * 1.0 / 256));
    dim3 block(256);

    decodingInitialize<T><<<grid, block, 0, stream>>>(
        finished, sequence_length, word_ids, cum_log_probs, sentence_ids, batch_size, beam_width, max_input_length);
}

template void invokeDecodingInitialize(bool* finished, int* sequence_length, int* word_ids, float* cum_log_probs,
                                       const int* sentence_ids, const int batch_size, const int beam_width,
                                       const int max_input_length, cudaStream_t stream);

template void invokeDecodingInitialize(bool* finished, int* sequence_length, int* word_ids, half* cum_log_probs,
                                       const int* sentence_ids, const int batch_size, const int beam_width,
                                       const int max_input_length, cudaStream_t stream);

#ifdef ENABLE_BF16
template void invokeDecodingInitialize(bool* finished, int* sequence_length, int* word_ids,
                                       __nv_bfloat16* cum_log_probs, const int* sentence_ids, const int batch_size,
                                       const int beam_width, const int max_input_length, cudaStream_t stream);
#endif
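The initialization above gives only the first beam of each batch entry a cumulative log-probability of 0 and pushes the others to the most negative representable value, so the first beam-search step effectively expands a single hypothesis per request. A tiny host-side illustration of the resulting layout (reference only, not part of the kernels):

#include <limits>
#include <vector>

// cum_log_probs laid out as [batch_size, beam_width]: 0 for beam 0, a large negative value otherwise.
std::vector<float> init_cum_log_probs(int batch_size, int beam_width)
{
    std::vector<float> cum(batch_size * beam_width);
    for (int i = 0; i < batch_size * beam_width; ++i) {
        cum[i] = (i % beam_width == 0) ? 0.0f : -std::numeric_limits<float>::max();
    }
    return cum;
}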
// PROMPT_SRC: 0 --> no prompts, 1 --> from loaded prompts, 2 --> from request prompts
template<typename T>
__global__ void embeddingLookupPosEncoding(T* from_tensor,
...
@@ -364,33 +289,33 @@ void invokePaddingEmbedding(T* padded_embedding_kernel,
                                         vocab_size_padded);
}
template void invokePaddingEmbedding(float* padded_embedding_kernel, float* padded_embedding_bias,
                                     const float* embedding_kernel, const float* embedding_bias,
                                     const int hidden_unit, const int vocab_size, const int vocab_size_padded,
                                     cudaStream_t stream);

template void invokePaddingEmbedding(half* padded_embedding_kernel, half* padded_embedding_bias,
                                     const half* embedding_kernel, const half* embedding_bias,
                                     const int hidden_unit, const int vocab_size, const int vocab_size_padded,
                                     cudaStream_t stream);

#ifdef ENABLE_BF16
template void invokePaddingEmbedding(__nv_bfloat16* padded_embedding_kernel, __nv_bfloat16* padded_embedding_bias,
                                     const __nv_bfloat16* embedding_kernel, const __nv_bfloat16* embedding_bias,
                                     const int hidden_unit, const int vocab_size, const int vocab_size_padded,
                                     cudaStream_t stream);
#endif

// template void invokePaddingEmbedding(float* padded_embedding_kernel,
//                                      float* padded_embedding_bias,
//                                      const float* embedding_kernel,
//                                      const float* embedding_bias,
//                                      const int hidden_unit,
//                                      const int vocab_size,
//                                      const int vocab_size_padded,
//                                      cudaStream_t stream);
// template void invokePaddingEmbedding(half* padded_embedding_kernel,
//                                      half* padded_embedding_bias,
//                                      const half* embedding_kernel,
//                                      const half* embedding_bias,
//                                      const int hidden_unit,
//                                      const int vocab_size,
//                                      const int vocab_size_padded,
//                                      cudaStream_t stream);
// #ifdef ENABLE_BF16
// template void invokePaddingEmbedding(__nv_bfloat16* padded_embedding_kernel,
//                                      __nv_bfloat16* padded_embedding_bias,
//                                      const __nv_bfloat16* embedding_kernel,
//                                      const __nv_bfloat16* embedding_bias,
//                                      const int hidden_unit,
//                                      const int vocab_size,
//                                      const int vocab_size_padded,
//                                      cudaStream_t stream);
// #endif
template<typename T>
__global__ void paddingEmbeddingKernel(T* padded_embedding_kernel,
...
@@ -426,256 +351,28 @@ void invokePaddingEmbeddingKernel(T* padded_embedding_kernel,
        padded_embedding_kernel, embedding_kernel, hidden_unit, vocab_size, vocab_size_padded);
}
template void invokePaddingEmbeddingKernel(float* padded_embedding_kernel, const float* embedding_kernel,
                                           const int hidden_unit, const int vocab_size,
                                           const int vocab_size_padded, cudaStream_t stream);

template void invokePaddingEmbeddingKernel(half* padded_embedding_kernel, const half* embedding_kernel,
                                           const int hidden_unit, const int vocab_size,
                                           const int vocab_size_padded, cudaStream_t stream);

#ifdef ENABLE_BF16
template void invokePaddingEmbeddingKernel(__nv_bfloat16* padded_embedding_kernel,
                                           const __nv_bfloat16* embedding_kernel,
                                           const int hidden_unit, const int vocab_size,
                                           const int vocab_size_padded, cudaStream_t stream);
#endif

__global__ void gatherTree(gatherTreeParam param)
{
    //  PREFIX SOFT PROMPT
    //  beam: have six parts
    //      [prompt | input | input_padding | prompt_padding | generated output | padding (use end_token)]
    //  parents: have five parts
    //      [prompt | input | input_padding | prompt_padding | generated output | padding (use 0)]
    //  step_ids: need to remove prompt, input_padding and prompt_padding
    //      the shape is [input_length + requested_output_length, bs, beam_width]
    //      need to transpose to output_ids [bs, beam_width, input_length + requested_output_length]
    //  max_input_length: input + input_padding + prompt_padding

    //  P/PROMPT TUNING
    //  NOTE: input (real ids | prompt virtual ids) have already been preprocessed during embedding lookup, no prompt
    //  templates now beam: [input (real ids | prompt virtual ids) | input_padding | generated output | padding (use
    //  end_token)] parents: [input (real ids | prompt virtual ids) | input_padding | generated output | padding (use
    //  0)] step_ids: need to remove virtual prompt ids in input ids
    //      the shape is [input_length (real input length, prompt length) + requested_output_length, bs, beam_width]
    //      need to transpose to output_ids [bs, beam_width, input_length + requested_output_length]
    //  max_input_length: input (real ids | prompt virtual ids) + input_padding

    const int max_input_length = param.input_lengths == nullptr ? 0 : param.max_input_length;

    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < param.batch_size * param.beam_width;
         i += gridDim.x * blockDim.x) {
        const int batch = i / param.beam_width;
        const int beam  = i % param.beam_width;
        const int prompt_len =
            param.prefix_soft_prompt_lengths == nullptr ? 0 : param.prefix_soft_prompt_lengths[batch];
        int input_len = param.input_lengths == nullptr ? 0 : param.input_lengths[i];
        // virtual prompts mean the prompt embedded in input ids (with prompt templates) [p/prompt tuning]
        const int virtual_prompt_length =
            param.p_prompt_tuning_prompt_lengths == nullptr ? 0 : param.p_prompt_tuning_prompt_lengths[batch];
        // real input length (without virtual prompts) [p/prompt tuning]
        input_len -= virtual_prompt_length;

        const int* parent_ids = param.parent_ids;
        const int* step_ids   = param.step_ids;

        // TODO(bhsueh) optimize the reduce_max operation for large beam_width
        int  max_len                      = -1;
        bool update_response_input_length = param.response_input_lengths != nullptr;
        // int  selected_beam_index          = 0;
        for (int j = 0; j < param.beam_width; j++) {
            int tmp_len =
                param.max_sequence_lengths[batch * param.beam_width + j] + param.max_sequence_length_final_step;
            // also remove the length of the soft prompts, p_prompt_tuning
            param.max_sequence_lengths[batch * param.beam_width + j] =
                tmp_len - param.max_prefix_soft_prompt_length
                - (param.max_input_length - param.max_input_without_prompt_length);
            // update the response input length
            if (update_response_input_length) {
                param.response_input_lengths[batch * param.beam_width + j] = input_len - prompt_len;
            }
            if (tmp_len > max_len) {
                max_len = tmp_len;
                // selected_beam_index = j;
            }
        }
        const int max_seq_len_b = min(param.max_time, max_len);
        if (max_seq_len_b <= 0) {
            continue;
        }

#define GET_IX(time_ix, beam_ix)                                                                                       \
    (param.batch_size * param.beam_width * (time_ix) + param.beam_width * batch + (beam_ix))

        const int padding_offset_and_prompt_offset = max_input_length - input_len + prompt_len;
        const int initial_tgt_ix    = GET_IX(max_seq_len_b - 1 - padding_offset_and_prompt_offset, beam);
        const int initial_parent_ix = GET_IX(max_seq_len_b - 1, beam);
        param.beams[initial_tgt_ix] = __ldg(step_ids + initial_parent_ix);
        int  parent    = parent_ids == nullptr ? 0 : __ldg(parent_ids + initial_parent_ix) % param.beam_width;
        bool found_bad = false;

        for (int level = max_seq_len_b - 2; level >= 0; --level) {
            if (level < prompt_len || (level >= input_len && level < max_input_length)) {
                continue;
            }
            int tgt_level = level >= max_input_length ? level - padding_offset_and_prompt_offset : level - prompt_len;
            const int level_beam_ix   = GET_IX(tgt_level, beam);
            const int level_parent_ix = GET_IX(level, parent);
            if (parent < 0 || parent > param.beam_width) {
                // param.beams[level_beam_ix] = -1;
                param.beams[level_beam_ix] = param.end_tokens[batch];
                parent                     = -1;
                found_bad                  = true;
            }
            else {
                param.beams[level_beam_ix] = __ldg(step_ids + level_parent_ix);
                parent = parent_ids == nullptr ? 0 : __ldg(parent_ids + level_parent_ix) % param.beam_width;
            }
        }

        // set the padded part as end_token
        // input_len
        for (int index = max_len - padding_offset_and_prompt_offset;
             index < param.max_time - param.max_prefix_soft_prompt_length;
             ++index) {
            param.beams[GET_IX(index, beam)] = param.end_tokens[batch];
        }

        // Not necessary when using a BeamSearchDecoder, but necessary
        // when a user feeds in possibly broken trajectory (i.e., non-eos
        // entries in a beam following eos entries).
        if (!found_bad) {
            bool finished = false;
            // skip the step 0 because it is often the start token
            int start_step = max_input_length == 0 ? 1 : max_input_length;
            for (int time = start_step; time < max_seq_len_b; ++time) {
                const int level_beam_ix = GET_IX(time, beam);
                if (finished) {
                    param.beams[level_beam_ix] = param.end_tokens[batch];
                }
                else if (param.beams[level_beam_ix] == param.end_tokens[batch]) {
                    finished = true;
                }
            }
        }
#undef GET_IX

        // transpose on output_ids
        // remove p_prompt tuning virtual tokens (end tokens)
        int actual_output_length = param.max_time - param.max_prefix_soft_prompt_length
                                   - (param.max_input_length - param.max_input_without_prompt_length);
        if (param.output_ids != nullptr) {
            for (int j = 0; j < actual_output_length; j++) {
                param.output_ids[i * actual_output_length + j] =
                    param.beams[j * param.batch_size * param.beam_width + i];
            }
        }
    }
}

void invokeGatherTree(int*         beams,
                      int*         max_sequence_lengths,
                      const int    max_time,
                      const int    batch_size,
                      const int    beam_width,
                      const int*   step_ids,
                      const int*   parent_ids,
                      const int*   end_tokens,
                      cudaStream_t stream)
{
    gatherTreeParam param;
    param.beams                      = beams;
    param.max_sequence_lengths       = max_sequence_lengths;
    param.max_time                   = max_time;
    param.batch_size                 = batch_size;
    param.beam_width                 = beam_width;
    param.step_ids                   = step_ids;
    param.parent_ids                 = parent_ids;
    param.end_tokens                 = end_tokens;
    param.max_input_length           = 1;
    param.prefix_soft_prompt_lengths = nullptr;
    param.stream                     = stream;
    invokeGatherTree(param);
}

void invokeGatherTree(int*         beams,
                      int*         max_sequence_lengths,
                      const int    max_time,
                      const int    batch_size,
                      const int    beam_width,
                      const int*   step_ids,
                      const int*   parent_ids,
                      const int*   end_tokens,
                      const int    max_input_length,
                      cudaStream_t stream)
{
    gatherTreeParam param;
    param.beams                      = beams;
    param.max_sequence_lengths       = max_sequence_lengths;
    param.max_time                   = max_time;
    param.batch_size                 = batch_size;
    param.beam_width                 = beam_width;
    param.step_ids                   = step_ids;
    param.parent_ids                 = parent_ids;
    param.end_tokens                 = end_tokens;
    param.max_input_length           = max_input_length;
    param.prefix_soft_prompt_lengths = nullptr;
    param.stream                     = stream;
    invokeGatherTree(param);
}

void invokeGatherTree(gatherTreeParam param)
{
    int  batchbeam = param.batch_size * param.beam_width;
    dim3 grid(1), block(batchbeam);
    // though decoder do not support > 1024 for now
    if (batchbeam > 1024) {
        grid.x  = ceil(param.batch_size * param.beam_width / 1024.);
        block.x = 1024;
    }
    gatherTree<<<grid, block, 0, param.stream>>>(param);
}
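Stripped of the prompt and padding bookkeeping, gatherTree is a parent-pointer backtrack: starting from the last step of a beam it repeatedly follows parent_ids to recover which token was emitted at each earlier step. A simplified host-side sketch of that core loop (it assumes no prompts or padding, which the real kernel also has to handle):

#include <vector>

// step_ids and parent_ids are [max_time, batch_size, beam_width]; returns the token path of one beam.
std::vector<int> backtrack_beam(const std::vector<int>& step_ids, const std::vector<int>& parent_ids,
                                int batch_size, int beam_width, int batch, int beam, int seq_len)
{
    auto idx = [&](int t, int b) { return batch_size * beam_width * t + beam_width * batch + b; };
    std::vector<int> tokens(seq_len);
    int parent = beam;
    for (int t = seq_len - 1; t >= 0; --t) {
        tokens[t] = step_ids[idx(t, parent)];            // token emitted at step t on the traced path
        parent    = parent_ids[idx(t, parent)] % beam_width;  // hop to the beam it was expanded from
    }
    return tokens;
}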
__global__ void minusUnfinishedSeqlen(int* sequence_lengths, const bool* finished, const int token_num)
{
    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < token_num; i += blockDim.x * gridDim.x) {
        if (finished[i] == false) {
            sequence_lengths[i] -= 1;
        }
    }
}

void invokeMinusUnfinishedSeqlen(int* sequence_lengths, const bool* finished, const int token_num, cudaStream_t stream)
{
    dim3 block(min(256, token_num));
    dim3 grid(ceil(token_num / 256.));
    minusUnfinishedSeqlen<<<block, grid, 0, stream>>>(sequence_lengths, finished, token_num);
}

__global__ void plusUnfinishedSeqlen(int* sequence_lengths, const bool* finished, const int token_num)
{
    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < token_num; i += blockDim.x * gridDim.x) {
        if (finished[i] == false) {
            sequence_lengths[i] += 1;
        }
    }
}

void invokePlusUnfinishedSeqlen(int* sequence_lengths, const bool* finished, const int token_num, cudaStream_t stream)
{
    dim3 block(min(256, token_num));
    dim3 grid(ceil(token_num / 256.));
    plusUnfinishedSeqlen<<<block, grid, 0, stream>>>(sequence_lengths, finished, token_num);
}
// template void invokePaddingEmbeddingKernel(float* padded_embedding_kernel,
// const float* embedding_kernel,
// const int hidden_unit,
// const int vocab_size,
// const int vocab_size_padded,
// cudaStream_t stream);
// template void invokePaddingEmbeddingKernel(half* padded_embedding_kernel,
// const half* embedding_kernel,
// const int hidden_unit,
// const int vocab_size,
// const int vocab_size_padded,
// cudaStream_t stream);
// #ifdef ENABLE_BF16
// template void invokePaddingEmbeddingKernel(__nv_bfloat16* padded_embedding_kernel,
// const __nv_bfloat16* embedding_kernel,
// const int hidden_unit,
// const int vocab_size,
// const int vocab_size_padded,
// cudaStream_t stream);
// #endif
template<typename T>
__global__ void plusScalar(T* buf, const T val, const int size)
...
@@ -695,112 +392,4 @@ void invokePlusScalar(T* buf, const T val, const int size, cudaStream_t stream)
template void invokePlusScalar(int* buf, const int val, const int size, cudaStream_t stream);

__global__ void finalize(int*         output_ids,
                         int*         sequence_lengths,
                         float*       cum_log_probs,
                         float*       output_log_probs,
                         const int*   topk_output_ids,
                         const int*   topk_sequence_lengths,
                         const float* scores,
                         const float* topk_cum_log_probs,
                         const float* topk_log_probs,
                         const int*   num_beams,
                         const int    beam_width,
                         const int    max_seq_len)
{
    // output_ids: [bs, beam_width, max_seq_len]
    // sequence_lengths: [bs, beam_width]
    // cum_log_probs: [bs, beam_width]
    // output_log_probs: [bs, beam_width, max_seq_len]
    // topk_output_ids: [bs, 2 * beam_width, max_seq_len + 1]
    // topk_sequence_lengths: [bs, 2 * beam_width]
    // scores: [bs, 2 * beam_width]
    // topk_cum_log_probs: [bs, 2 * beam_width]
    // topk_log_probs: [bs, 2 * beam_width, max_seq_len + 1]
    // num_beams: [bs]

    // This kernel do a sorting for scores first, and then put the topk_output_ids
    // into output_ids by the rank of scores.
    // Note that we remove the start_token (the id at first position) from topk_output_ids

    extern __shared__ char array[];
    int*                   rank     = (int*)(array);
    float*                 s_scores = (float*)(rank + beam_width);
    if (threadIdx.x < num_beams[blockIdx.x]) {
        s_scores[threadIdx.x] = scores[blockIdx.x * beam_width * 2 + threadIdx.x];
    }
    __syncthreads();

    for (int i = 0; i < beam_width; i++) {
        float score     = threadIdx.x < num_beams[blockIdx.x] ? s_scores[threadIdx.x] : -FLT_MAX;
        float max_score = blockReduceMax<float>(score);

        if (threadIdx.x == 0) {
            for (int j = 0; j < beam_width * 2; j++) {
                if (s_scores[j] == max_score) {
                    rank[i]     = j;
                    s_scores[j] = -FLT_MAX;
                    break;
                }
            }
        }
        __syncthreads();
    }

    if (threadIdx.x < beam_width) {
        sequence_lengths[blockIdx.x * beam_width + threadIdx.x] =
            topk_sequence_lengths[blockIdx.x * beam_width * 2 + rank[threadIdx.x]];
        if (cum_log_probs != nullptr) {
            cum_log_probs[blockIdx.x * beam_width + threadIdx.x] =
                topk_cum_log_probs[blockIdx.x * beam_width * 2 + rank[threadIdx.x]];
        }
    }

    for (int beam_idx = 0; beam_idx < beam_width; beam_idx++) {
        // start from step 1 to skip the start token
        for (int i = threadIdx.x; i < sequence_lengths[blockIdx.x * beam_width + beam_idx]; i += blockDim.x) {
            output_ids[blockIdx.x * beam_width * max_seq_len + beam_idx * max_seq_len + i] =
                topk_output_ids[blockIdx.x * (beam_width * 2) * (max_seq_len + 1) + rank[beam_idx] * (max_seq_len + 1)
                                + (i + 1)];
            if (output_log_probs != nullptr) {
                output_log_probs[blockIdx.x * beam_width * max_seq_len + beam_idx * max_seq_len + i] =
                    topk_log_probs[blockIdx.x * (beam_width * 2) * (max_seq_len + 1)
                                   + rank[beam_idx] * (max_seq_len + 1) + (i + 1)];
            }
        }
    }
}

void invokeFinalize(int*         output_ids,
                    int*         sequence_lengths,
                    float*       cum_log_probs,
                    float*       output_log_probs,
                    const int*   topk_output_ids,
                    const int*   topk_sequence_lengths,
                    const float* scores,
                    const float* topk_cum_log_probs,
                    const float* topk_log_probs,
                    const int*   num_beams,
                    const int    beam_width,
                    const int    max_seq_len,
                    const int    batch_size,
                    cudaStream_t stream)
{
    dim3 block(beam_width * 2);
    block.x = (block.x + 31) / 32 * 32;
    FT_CHECK(block.x < 1024);
    finalize<<<batch_size, block, beam_width * sizeof(int) + (beam_width * 2) * sizeof(float), stream>>>(
        output_ids,
        sequence_lengths,
        cum_log_probs,
        output_log_probs,
        topk_output_ids,
        topk_sequence_lengths,
        scores,
        topk_cum_log_probs,
        topk_log_probs,
        num_beams,
        beam_width,
        max_seq_len);
}

}  // namespace turbomind
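The rank[] selection inside finalize is effectively a partial selection sort: beam_width passes, each taking the best remaining of the 2 * beam_width candidate scores and masking it out. A host-side reference of that selection (illustrative only, not part of the kernel):

#include <cfloat>
#include <vector>

// Returns the indices of the beam_width highest scores among 2 * beam_width candidates, best first.
std::vector<int> select_ranks(std::vector<float> scores, int beam_width)
{
    std::vector<int> rank(beam_width);
    for (int i = 0; i < beam_width; ++i) {
        int best = 0;
        for (int j = 1; j < (int)scores.size(); ++j) {
            if (scores[j] > scores[best]) {
                best = j;
            }
        }
        rank[i]      = best;
        scores[best] = -FLT_MAX;  // same masking trick the kernel uses after picking a score
    }
    return rank;
}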
src/turbomind/kernels/decoding_kernels.h
View file @ 981a4610
...
@@ -22,17 +22,6 @@
namespace turbomind {

template<typename T>
void invokeDecodingInitialize(bool*        finished,
                              int*         sequence_length,
                              int*         word_ids,
                              T*           cum_log_probs,
                              const int*   sentence_ids,
                              const int    batch_size,
                              const int    beam_width,
                              const int    max_input_length,
                              cudaStream_t stream);

// get token from all_ids at step, then lookup from the embedding table
// by the token
template<typename T>
...
@@ -99,72 +88,7 @@ void invokePaddingEmbeddingKernel(T* padded_embedding_kernel,
                                  const int    vocab_size_padded,
                                  cudaStream_t stream);

void invokeGatherTree(int*         beams,
                      int*         max_sequence_lengths,
                      const int    max_time,
                      const int    batch_size,
                      const int    beam_width,
                      const int*   step_ids,
                      const int*   parent_ids,
                      const int*   end_tokens,
                      cudaStream_t stream);

void invokeGatherTree(int*         beams,
                      int*         max_sequence_lengths,
                      const int    max_time,
                      const int    batch_size,
                      const int    beam_width,
                      const int*   step_ids,
                      const int*   parent_ids,
                      const int*   end_tokens,
                      const int    max_input_length,
                      cudaStream_t stream);

struct gatherTreeParam {
    int* beams                          = nullptr;
    int* max_sequence_lengths           = nullptr;
    int  max_sequence_length_final_step = 0;
    const int* input_lengths            = nullptr;
    // response input lengths (used to slice the ids during postprocessing)
    int*       response_input_lengths = nullptr;
    int        max_time               = 0;
    int        batch_size             = 0;
    int        beam_width             = 0;
    const int* step_ids               = nullptr;
    const int* parent_ids             = nullptr;
    const int* end_tokens             = nullptr;
    int        max_input_length       = 0;
    const int* prefix_soft_prompt_lengths = nullptr;
    // p_prompt_tuning prompt lengths, used to remove prompts during post-processing
    const int* p_prompt_tuning_prompt_lengths  = nullptr;
    int        max_input_without_prompt_length = 0;
    // prefix soft prompt
    int          max_prefix_soft_prompt_length = 0;
    int*         output_ids                    = nullptr;
    cudaStream_t stream;
};

void invokeGatherTree(gatherTreeParam param);

void invokeMinusUnfinishedSeqlen(int* sequence_lengths, const bool* finished, const int token_num, cudaStream_t stream);
void invokePlusUnfinishedSeqlen(int* sequence_lengths, const bool* finished, const int token_num, cudaStream_t stream);

template<typename T>
void invokePlusScalar(T* buf, const T val, const int size, cudaStream_t stream);

void invokeFinalize(int*         output_ids,
                    int*         sequence_lengths,
                    float*       cum_log_probs,
                    float*       output_log_probs,
                    const int*   topk_output_ids,
                    const int*   topk_sequence_lengths,
                    const float* scores,
                    const float* topk_cum_log_probs,
                    const float* topk_log_probs,
                    const int*   num_beams,
                    const int    beam_width,
                    const int    max_seq_len,
                    const int    batch_size,
                    cudaStream_t stream);

}  // namespace turbomind
src/turbomind/kernels/gen_relative_pos_bias.cu
deleted 100644 → 0   View file @ 83697422
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "cublas_v2.h"
#include "gen_relative_pos_bias.h"
#include "reduce_kernel_utils.cuh"
#include "src/turbomind/kernels/activation_kernels.h"
#include "src/turbomind/utils/cuda_utils.h"
#include <cstdio>
namespace turbomind {

/******************* invokeGenRelativePosBias ***********************/
// relative_position_bias_table is [(2*window_size-1)*(2*window_size-1), headNum]
// relative_position_bias is [head_num, window_size^2, window_size^2]
// grid(window_size*window_size, head_num)
// block(window_size*window_size)
template<typename T, typename Tindex>
__global__ void gen_relative_pos_bias(T*            relative_position_bias,
                                      const T*      relative_position_bias_table,
                                      const Tindex* relative_position_bias_index,
                                      const int     window_size,
                                      const int     head_num)
{
    const int    h_in_window           = blockIdx.x / window_size;
    const int    w_in_window           = blockIdx.x % window_size;
    const int    h_in_token            = threadIdx.x / window_size;
    const int    w_in_token            = threadIdx.x % window_size;
    const int    head_idx              = blockIdx.y;
    const int    elements_per_window   = window_size * window_size;
    const size_t elements_per_window_2 = elements_per_window * elements_per_window;
    const size_t output_idx = head_idx * elements_per_window_2 + blockIdx.x * elements_per_window + threadIdx.x;
    if (output_idx < head_num * elements_per_window_2) {
        const Tindex idx_in_table =
            relative_position_bias_index[(h_in_window * window_size + w_in_window) * elements_per_window
                                         + h_in_token * window_size + w_in_token];
        relative_position_bias[output_idx] = relative_position_bias_table[idx_in_table * head_num + head_idx];
    }
}

template<typename T, typename Tindex>
void invokeGenRelativePosBias(T*            relative_position_bias,
                              const T*      relative_position_bias_table,
                              const Tindex* relative_position_bias_index,
                              const int     window_size,
                              const int     head_num,
                              cudaStream_t  stream)
{
    dim3 grid(window_size * window_size, head_num);
    dim3 block(window_size * window_size);

    if (block.x > 1024) {
        printf("[ERROR][invokeGenRelativePosBias] window_size*window_size > 1024.\n");
        exit(-1);
    }

    gen_relative_pos_bias<<<grid, block, 0, stream>>>(
        relative_position_bias, relative_position_bias_table, relative_position_bias_index, window_size, head_num);
}

/******************* invokeGenRelativePosBiasV2 ***********************/

template<typename T, typename Tindex>
void invokeGenRelativePosBiasV2(T*            relative_position_bias,
                                const T*      relative_coords_table,
                                const Tindex* relative_position_bias_index,
                                const T*      cpb_mlp_weight1,
                                const T*      cpb_mlp_bias1,
                                const T*      cpb_mlp_weight2,
                                const int     window_size,
                                const int     cpb_mlp_in_dim,
                                const int     cpb_mlp_out_dim,
                                const int     head_num,
                                cudaStream_t  stream)
{
    dim3 grid(window_size * window_size, head_num);
    dim3 block(window_size * window_size);

    if (block.x > 1024) {
        printf("[ERROR][invokeGenRelativePosBias] window_size*window_size > 1024.\n");
        exit(-1);
    }

    T* relative_position_bias_table;
    check_cuda_error(cudaMalloc(&relative_position_bias_table,
                                ((2 * window_size - 1) * (2 * window_size - 1) * head_num) * sizeof(T)));
    T* cpb_mlp_1;
    check_cuda_error(
        cudaMalloc(&cpb_mlp_1, ((2 * window_size - 1) * (2 * window_size - 1) * cpb_mlp_out_dim) * sizeof(T)));

    cublasHandle_t cublas_handle;
    check_cuda_error(cublasCreate(&cublas_handle));

    int            m     = (2 * window_size - 1) * (2 * window_size - 1);
    T              alpha = (T)1.0f;
    T              beta  = (T)0.0f;
    cudaDataType_t type  = std::is_same<float, T>::value ? CUDA_R_32F : CUDA_R_16F;
#if (CUDART_VERSION >= 11000)
    cublasComputeType_t compute_type = std::is_same<float, T>::value ? CUBLAS_COMPUTE_32F : CUBLAS_COMPUTE_16F;
#else
    cudaDataType_t compute_type = std::is_same<float, T>::value ? CUDA_R_32F : CUDA_R_16F;
#endif
    cublasGemmAlgo_t algo = std::is_same<float, T>::value ? CUBLAS_GEMM_DEFAULT : CUBLAS_GEMM_DEFAULT_TENSOR_OP;

    check_cuda_error(cublasGemmEx(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N,
                                  cpb_mlp_out_dim, m, cpb_mlp_in_dim,
                                  &alpha, cpb_mlp_weight1, type, cpb_mlp_in_dim,
                                  relative_coords_table, type, cpb_mlp_in_dim,
                                  &beta, cpb_mlp_1, type, cpb_mlp_out_dim,
                                  compute_type, algo));

    invokeGenericActivation<ReluActivation, T, T>(
        cpb_mlp_1, cpb_mlp_bias1, nullptr, nullptr, nullptr, nullptr, m, cpb_mlp_out_dim, 0, nullptr, nullptr, stream);

    check_cuda_error(cublasGemmEx(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N,
                                  head_num, m, cpb_mlp_out_dim,
                                  &alpha, cpb_mlp_weight2, type, cpb_mlp_out_dim,
                                  cpb_mlp_1, type, cpb_mlp_out_dim,
                                  &beta, relative_position_bias_table, type, head_num,
                                  compute_type, algo));

    gen_relative_pos_bias<<<grid, block, 0, stream>>>(
        relative_position_bias, relative_position_bias_table, relative_position_bias_index, window_size, head_num);
    invokeSigmoid(
        relative_position_bias, window_size * window_size * window_size * window_size * head_num, 16.0f, stream);
    check_cuda_error(cudaFree(relative_position_bias_table));
    check_cuda_error(cudaFree(cpb_mlp_1));
    check_cuda_error(cublasDestroy(cublas_handle));
}

/******************* instantiation ***********************/

template void invokeGenRelativePosBias(float* relative_position_bias, const float* relative_position_bias_table,
                                       const int* relative_position_bias_index, const int window_size,
                                       const int head_num, cudaStream_t stream);
template void invokeGenRelativePosBias(half* relative_position_bias, const half* relative_position_bias_table,
                                       const int* relative_position_bias_index, const int window_size,
                                       const int head_num, cudaStream_t stream);
template void invokeGenRelativePosBias(float* relative_position_bias, const float* relative_position_bias_table,
                                       const int64_t* relative_position_bias_index, const int window_size,
                                       const int head_num, cudaStream_t stream);
template void invokeGenRelativePosBias(half* relative_position_bias, const half* relative_position_bias_table,
                                       const int64_t* relative_position_bias_index, const int window_size,
                                       const int head_num, cudaStream_t stream);

__host__ __device__ uint32_t pow2_rounddown(uint32_t x)
{
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    x >>= 1;
    return x + 1;
}

template<typename T>
__global__ void generate_alibi_slopes(T* alibi_slopes, const size_t num_heads)
{
    if (threadIdx.x < num_heads) {
        // The nearest power of 2 greater than num_heads followed by HF's implementation.
        int num_heads_pow2 = pow2_rounddown(num_heads);
        // Loop over the attention head.
        for (int h = threadIdx.x; h < num_heads; h += blockDim.x) {
            if (h < num_heads_pow2) {
                alibi_slopes[h] = static_cast<T>(powf(powf(0.5f, powf(0.5f, log2f(num_heads_pow2) - 3.f)), h + 1));
            }
            else {
                alibi_slopes[h] = static_cast<T>(
                    powf(powf(0.5f, powf(0.5f, log2f(num_heads_pow2 << 1) - 3.f)), (h - num_heads_pow2) * 2 + 1));
            }
        }
    }
}

template<typename T>
void invokeBuildAlibiSlopes(T* alibi_slopes, const size_t num_heads, cudaStream_t stream)
{
    // Generate the slopes of a linear attention linear bias.
    //
    // Paper: https://arxiv.org/abs/2108.12409
    // HF's implementation
    // https://github.com/huggingface/transformers/blob/56ef0ba44765162f830873c140bd40bdc975cc34/src/transformers/models/bloom/modeling_bloom.py#L86
    // Author's implementation
    // https://github.com/ofirpress/attention_with_linear_biases/blob/02aa87e7a29e9340efd28d6d169018eafb3aa57a/fairseq/models/transformer.py#L760
    //
    // alibi_slopes: [num_heads],
    //     strictly follows how HF implements it, which treats power-of-2 heads and non-power-of-2 heads differently.
    //     What the paper generates differs from HF's when the number of heads is not a power of 2.
    // num_heads: the number of attention heads.
    // stream: a cuda stream.
    dim3 block(min((int)num_heads, 512));
    generate_alibi_slopes<<<1, block, 0, stream>>>(alibi_slopes, num_heads);
}

template void invokeBuildAlibiSlopes(float* alibi_slopes, const size_t num_heads, cudaStream_t stream);
template void invokeBuildAlibiSlopes(half* alibi_slopes, const size_t num_heads, cudaStream_t stream);
#ifdef ENABLE_BF16
template void invokeBuildAlibiSlopes(__nv_bfloat16* alibi_slopes, const size_t num_heads, cudaStream_t stream);
#endif
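Following the HF rule referenced in the comment above, the slopes form a geometric sequence anchored at the largest power of two not exceeding num_heads; any extra heads are filled from a second, interleaved sequence. A host-side C++ reference of the same recipe (illustrative only, mirroring the kernel above):

#include <cmath>
#include <vector>

std::vector<float> build_alibi_slopes(int num_heads)
{
    // Largest power of two <= num_heads (same role as pow2_rounddown in the kernel).
    int pow2 = 1;
    while (pow2 * 2 <= num_heads) {
        pow2 *= 2;
    }
    std::vector<float> slopes(num_heads);
    const float base = std::pow(0.5f, std::pow(0.5f, std::log2((float)pow2) - 3.f));
    for (int h = 0; h < num_heads && h < pow2; ++h) {
        slopes[h] = std::pow(base, (float)(h + 1));
    }
    if (num_heads > pow2) {
        // Extra heads interleave a sequence derived from the next power of two, taking odd exponents.
        const float extra_base = std::pow(0.5f, std::pow(0.5f, std::log2((float)(pow2 * 2)) - 3.f));
        for (int h = pow2; h < num_heads; ++h) {
            slopes[h] = std::pow(extra_base, (float)((h - pow2) * 2 + 1));
        }
    }
    return slopes;
}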
template void invokeGenRelativePosBiasV2(float* relative_position_bias, const float* relative_coords_table,
                                         const int* relative_position_bias_index, const float* cpb_mlp_weight1,
                                         const float* cpb_mlp_bias1, const float* cpb_mlp_weight2,
                                         const int window_size, const int cpb_mlp_in_dim, const int cpb_mlp_out_dim,
                                         const int head_num, cudaStream_t stream);
template void invokeGenRelativePosBiasV2(half* relative_position_bias, const half* relative_coords_table,
                                         const int* relative_position_bias_index, const half* cpb_mlp_weight1,
                                         const half* cpb_mlp_bias1, const half* cpb_mlp_weight2,
                                         const int window_size, const int cpb_mlp_in_dim, const int cpb_mlp_out_dim,
                                         const int head_num, cudaStream_t stream);
template void invokeGenRelativePosBiasV2(float* relative_position_bias, const float* relative_coords_table,
                                         const int64_t* relative_position_bias_index, const float* cpb_mlp_weight1,
                                         const float* cpb_mlp_bias1, const float* cpb_mlp_weight2,
                                         const int window_size, const int cpb_mlp_in_dim, const int cpb_mlp_out_dim,
                                         const int head_num, cudaStream_t stream);
template void invokeGenRelativePosBiasV2(half* relative_position_bias, const half* relative_coords_table,
                                         const int64_t* relative_position_bias_index, const half* cpb_mlp_weight1,
                                         const half* cpb_mlp_bias1, const half* cpb_mlp_weight2,
                                         const int window_size, const int cpb_mlp_in_dim, const int cpb_mlp_out_dim,
                                         const int head_num, cudaStream_t stream);

}  // namespace turbomind
src/turbomind/kernels/gen_relative_pos_bias.h
deleted 100644 → 0   View file @ 83697422
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/turbomind/utils/cuda_bf16_wrapper.h"
#include <assert.h>
#include <cuda_runtime.h>
#include <stdint.h>
namespace turbomind {

enum class PositionEmbeddingType {
    relative,
    absolute,
};

template<typename T, typename Tindex>
void invokeGenRelativePosBias(T*            relative_position_bias,
                              const T*      relative_position_bias_table,
                              const Tindex* relative_position_bias_index,
                              const int     window_size,
                              const int     head_num,
                              cudaStream_t  stream);

template<typename T>
void invokeBuildAlibiSlopes(T* linear_position_bias_slopes, const size_t head_num, cudaStream_t stream);

template<typename T, typename Tindex>
void invokeGenRelativePosBiasV2(T*            relative_position_bias,
                                const T*      relative_coords_table,
                                const Tindex* relative_position_bias_index,
                                const T*      cpb_mlp_weight1,
                                const T*      cpb_mlp_bias1,
                                const T*      cpb_mlp_weight2,
                                const int     window_size,
                                const int     cpb_mlp_in_dim,
                                const int     cpb_mlp_out_dim,
                                const int     head_num,
                                cudaStream_t  stream);

}  // namespace turbomind
src/turbomind/kernels/logprob_kernels.cu
View file @ 981a4610
...
@@ -182,29 +182,29 @@ void invokeLogProbFromLogits(float* cum_log_probs,
        cum_log_probs, log_probs, input_lengths, max_input_length, batch_size, batch_first);
}

template void invokeLogProbFromLogits(float* cum_log_probs, const float* logits, const int* input_ids,
                                      const int* input_lengths, const size_t max_input_length,
                                      const size_t batch_size, const size_t vocab_size,
                                      const size_t vocab_size_padded, void* workspace,
                                      const size_t workspace_size, cudaStream_t stream, const bool batch_first);

template void invokeLogProbFromLogits(float* cum_log_probs, const half* logits, const int* input_ids,
                                      const int* input_lengths, const size_t max_input_length,
                                      const size_t batch_size, const size_t vocab_size,
                                      const size_t vocab_size_padded, void* workspace,
                                      const size_t workspace_size, cudaStream_t stream, const bool batch_first);

// template void invokeLogProbFromLogits(float* cum_log_probs,
//                                       const float* logits,
//                                       const int* input_ids,
//                                       const int* input_lengths,
//                                       const size_t max_input_length,
//                                       const size_t batch_size,
//                                       const size_t vocab_size,
//                                       const size_t vocab_size_padded,
//                                       void* workspace,
//                                       const size_t workspace_size,
//                                       cudaStream_t stream,
//                                       const bool batch_first);
// template void invokeLogProbFromLogits(float* cum_log_probs,
//                                       const half* logits,
//                                       const int* input_ids,
//                                       const int* input_lengths,
//                                       const size_t max_input_length,
//                                       const size_t batch_size,
//                                       const size_t vocab_size,
//                                       const size_t vocab_size_padded,
//                                       void* workspace,
//                                       const size_t workspace_size,
//                                       cudaStream_t stream,
//                                       const bool batch_first);

}  // end of namespace turbomind
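Conceptually, a cumulative log probability over a prompt is the sum of per-step log-softmax values taken at the observed next token; the kernels in this file split that reduction across the padded vocabulary and the batch. The host-side sketch below shows only the per-sequence math; the exact indexing, offset, and batch_first conventions of invokeLogProbFromLogits are not restated here and are an assumption of this sketch:

#include <algorithm>
#include <cmath>
#include <vector>

// logits: [seq_len, vocab_size] for one sequence; ids: seq_len token ids.
// Accumulates log P(ids[t+1] | prefix up to t) for t = 0 .. seq_len-2.
float cum_log_prob(const std::vector<float>& logits, const std::vector<int>& ids, int seq_len, int vocab_size)
{
    float cum = 0.0f;
    for (int t = 0; t + 1 < seq_len; ++t) {
        const float* row     = logits.data() + (size_t)t * vocab_size;
        float        max_val = row[0];
        for (int v = 1; v < vocab_size; ++v) {
            max_val = std::max(max_val, row[v]);
        }
        float sum_exp = 0.0f;
        for (int v = 0; v < vocab_size; ++v) {
            sum_exp += std::exp(row[v] - max_val);
        }
        // log-softmax of the observed next token, accumulated over the sequence
        cum += (row[ids[t + 1]] - max_val) - std::log(sum_exp);
    }
    return cum;
}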
src/turbomind/kernels/online_softmax_beamsearch_kernels.cu
deleted 100644 → 0   View file @ 83697422
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#elif (CUDART_VERSION >= 11050)
#include <cub/cub.cuh>
#else
#include "3rdparty/cub/cub.cuh"
#endif
#include "src/turbomind/kernels/online_softmax_beamsearch_kernels.h"
#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
#include "src/turbomind/utils/cuda_utils.h"
namespace turbomind {

#define DO_SPLIT_SMALL_TOP_K_SOFTMAX

static const int SMALL_TOP_K_SOFTMAX_THREADBLOCK_SIZE = 256;

#define TOPK_FP16_STORAGE 0

template<typename T>
__device__ __forceinline__ T apply_length_penalty(T log_prob, int length, float length_penalty)
{
    // score = log(prob) / (length)^length_penalty.
    if (length_penalty == 0.0f || length == 1) {
        return log_prob;
    }
    return log_prob / static_cast<T>(powf(length, length_penalty));
}
template<typename T, int MAX_K, int THREADBLOCK_SIZE>
__launch_bounds__(THREADBLOCK_SIZE) __global__
    void batch_topK_kernel(int* topk_tmp_id_buf, T* topk_tmp_val_buf, int* id_buf)
{
    int thread_id = threadIdx.x;
    int block_id  = blockIdx.x;
    TopK<T, MAX_K> partial;
    if (thread_id == 0) {
        for (int i = 0; i < MAX_K; ++i) {
            partial.p[i] = -1;
            partial.u[i] = -FLT_MAX;
        }

        int index = block_id * MAX_K * MAX_K;
        for (int i = 0; i < MAX_K * MAX_K; i++) {
            partial.insert((T)topk_tmp_val_buf[index + i], topk_tmp_id_buf[index + i]);
        }

        index = block_id * MAX_K;
        for (int i = 0; i < MAX_K; i++) {
            id_buf[index + i] = partial.p[i];
        }
    }
}

template<typename T, int MAX_K, int THREADBLOCK_SIZE>
__launch_bounds__(THREADBLOCK_SIZE) __global__ void batch_topK_kernel(const int* __restrict topk_tmp_id_buf,
                                                                      const T* __restrict topk_tmp_val_buf,
                                                                      int* __restrict id_buf,
                                                                      T* __restrict val_buf)
{
    int thread_id = threadIdx.x;
    int block_id  = blockIdx.x;
    TopK<T, MAX_K> partial;
    if (thread_id == 0) {
        for (int i = 0; i < MAX_K; ++i) {
            partial.p[i] = -1;
            partial.u[i] = -FLT_MAX;
        }

        int index = block_id * MAX_K * MAX_K;
        for (int i = 0; i < MAX_K * MAX_K; i++) {
            partial.insert((T)topk_tmp_val_buf[index + i], topk_tmp_id_buf[index + i]);
        }

        index = block_id * MAX_K;
        for (int i = 0; i < MAX_K; i++) {
            id_buf[index + i]  = partial.p[i];
            val_buf[index + i] = partial.u[i];
        }
    }
}
template<typename T, int MAX_K, int THREADBLOCK_SIZE>
__launch_bounds__(THREADBLOCK_SIZE) __global__ void batch_topk_kernel(const int* __restrict x,
                                                                       const T* __restrict y,
                                                                       int* __restrict z,
                                                                       float* __restrict v,
                                                                       float* output_log_probs,
                                                                       const bool* finished,
                                                                       const int* sequence_lengths,
                                                                       BeamHypotheses beam_hyps,
                                                                       const int V,
                                                                       const int K,
                                                                       const int vocab_size,
                                                                       const float length_penalty,
                                                                       const T diversity_rate)
{
    int thread_id = threadIdx.x;
    int vector_id = blockIdx.x;

    // reposition x, y to data for the current vector
    x += vector_id * V;
    y += vector_id * V;

    typedef cub::BlockReduce<TopK<T, MAX_K>, THREADBLOCK_SIZE> BlockReduce;

    __shared__ typename BlockReduce::TempStorage temp_storage;
    __shared__ int                               selected_beams;
    __shared__ float                             old_cum_log_probs[MAX_K];

    if (thread_id == 0) {
        selected_beams = 0;
    }
    if (thread_id < K) {
        old_cum_log_probs[thread_id] = v[vector_id * K + thread_id];
    }
    __syncthreads();

    if (beam_hyps.num_beams != nullptr) {
        const int global_batch_idx = beam_hyps.ite * beam_hyps.local_batch_size + vector_id;
        if (beam_hyps.num_beams[global_batch_idx] == 0 && thread_id == 0) {
            beam_hyps.min_normed_scores[global_batch_idx] = FLT_MAX;
        }
        else if (beam_hyps.num_beams[global_batch_idx] == K) {
            return;
        }
    }

    TopK<T, MAX_K> partial;
    for (int i = 0; i < MAX_K; ++i) {
        partial.p[i] = -1;
        partial.u[i] = -FLT_MAX;
    }
    for (int elem_id = thread_id; elem_id < V; elem_id += THREADBLOCK_SIZE) {
        int i    = elem_id % K;
        T   elem = length_penalty == 0.0f ?
                       y[elem_id] :
                       apply_length_penalty(y[elem_id],
                                            finished[vector_id] ? sequence_lengths[vector_id] :
                                                                  sequence_lengths[vector_id] + 1,
                                            length_penalty);
        elem += diversity_rate * (T)i;
        int elem_idx = elem_id;  // x[elem_id];
        partial.insert(elem, elem_idx);
    }

    TopK<T, MAX_K> total = BlockReduce(temp_storage).Reduce(partial, reduce_topk_op<T, MAX_K>);

    if (thread_id == 0) {
        z += vector_id * K;
        v += vector_id * K;

        for (int i = 0; i < MAX_K; ++i) {
            if (beam_hyps.num_beams != nullptr && x[total.p[i]] % vocab_size == beam_hyps.end_ids[vector_id]) {
                // if beam_token does not belong to top num_beams tokens, it should not be added. Refer from
                // https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/generation_beam_search.py#L257
                if (i >= K) {
                    // do nothing
                }
                else {
                    const int   global_batch_idx = beam_hyps.ite * beam_hyps.local_batch_size + vector_id;
                    const float normed_score     = (float)total.u[i];
                    const int   num_beam         = beam_hyps.num_beams[global_batch_idx];
                    int         beam_idx         = num_beam;
                    // If there are beam_width finished sentences, check whether the score of the selected candidate
                    // is higher than min_normed_score. If the current score is better, replace the worst one
                    // and update min_normed_score.
                    if (num_beam == K) {
                        if (normed_score < beam_hyps.min_normed_scores[global_batch_idx]) {
                            // end the tracing and exit this for loop
                            selected_beams = K;
                            break;
                        }
                        else {
                            // find the beam index whose score equals min_normed_score and erase it.
                            for (int j = 0; j < K; j++) {
                                if (beam_hyps.normed_scores[global_batch_idx * (K * 2) + j]
                                    == beam_hyps.min_normed_scores[global_batch_idx]) {
                                    beam_idx = j;
                                    beam_hyps.num_beams[global_batch_idx]--;

                                    beam_hyps.min_normed_scores[global_batch_idx]            = FLT_MAX;
                                    beam_hyps.normed_scores[global_batch_idx * (K * 2) + j]  = normed_score;
                                    for (int l = 0; l < K; l++) {
                                        beam_hyps.min_normed_scores[global_batch_idx] =
                                            min(beam_hyps.min_normed_scores[global_batch_idx],
                                                beam_hyps.normed_scores[global_batch_idx * (K * 2) + l]);
                                    }
                                    break;
                                }
                            }
                        }
                    }
                    const int tgt_id_offset =
                        ((vector_id + beam_hyps.ite * beam_hyps.local_batch_size) * (K * 2) + beam_idx)
                        * (beam_hyps.max_seq_len);
                    beam_hyps.output_ids_tgt[tgt_id_offset + beam_hyps.step] = beam_hyps.end_ids[vector_id];
                    if (beam_hyps.log_probs != nullptr) {
                        beam_hyps.log_probs[tgt_id_offset + beam_hyps.step] =
                            (float)y[total.p[i]] - old_cum_log_probs[(x[total.p[i]] / vocab_size) % K];
                    }

                    int prev_id = (x[total.p[i]] / vocab_size) % K;
                    for (int j = beam_hyps.step - 1; j >= 0; j--) {
                        const int src_idx = j * beam_hyps.batch_size * K
                                            + beam_hyps.ite * beam_hyps.local_batch_size * K + vector_id * K + prev_id;

                        beam_hyps.output_ids_tgt[tgt_id_offset + j] = beam_hyps.output_ids_src[src_idx];
                        if (beam_hyps.log_probs != nullptr && beam_hyps.log_probs_src != nullptr) {
                            beam_hyps.log_probs[tgt_id_offset + j] = beam_hyps.log_probs_src[src_idx];
                        }
                        prev_id = beam_hyps.parent_ids_src[src_idx];
                    }
                    const int tgt_beam_idx                        = global_batch_idx * (K * 2) + beam_idx;
                    beam_hyps.sequence_lengths_tgt[tgt_beam_idx]  = beam_hyps.step;
                    beam_hyps.normed_scores[tgt_beam_idx]         = normed_score;
                    beam_hyps.min_normed_scores[global_batch_idx] =
                        min(beam_hyps.min_normed_scores[global_batch_idx], beam_hyps.normed_scores[tgt_beam_idx]);

                    beam_hyps.num_beams[global_batch_idx]++;
                    beam_hyps.cum_log_probs[tgt_beam_idx] = (float)y[total.p[i]];
                }
            }
            else if ((beam_hyps.num_beams != nullptr && i < 2 * K) || (beam_hyps.num_beams == nullptr && i < K)) {
                z[selected_beams] = x[total.p[i]];
                if (output_log_probs != nullptr) {
                    output_log_probs[vector_id * K + selected_beams] =
                        (float)y[total.p[i]] - old_cum_log_probs[(z[selected_beams] / vocab_size) % K];
                }
                v[selected_beams] = (float)y[total.p[i]];
                selected_beams++;
            }
            __syncthreads();
            if (selected_beams >= K) {
                break;
            }
        }
    }
    if (threadIdx.x == 0 && beam_hyps.num_beams != nullptr) {
        if (beam_hyps.num_beams[blockIdx.x] < K) {
            beam_hyps.is_done[blockIdx.x] = false;
        }
        else if (beam_hyps.early_stopping) {
            beam_hyps.is_done[blockIdx.x] = true;
        }
    }
}
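The index arithmetic above relies on the earlier stages producing "absolute" candidate ids of the form (batch_idx * K + beam_idx) * vocab_size + token_id, which is why `x[...] % vocab_size` recovers the token and `(x[...] / vocab_size) % K` the parent beam. A small illustrative decoder under that assumption; the names below are not from the diff:

// Decode an absolute candidate id, assuming id = (batch_idx * K + beam_idx) * vocab_size + token_id.
#include <cstdio>

void decode_candidate(int id, int vocab_size, int K)
{
    const int token_id  = id % vocab_size;        // mirrors x[...] % vocab_size above
    const int beam_idx  = (id / vocab_size) % K;  // mirrors (x[...] / vocab_size) % K above
    const int batch_idx = id / vocab_size / K;
    std::printf("batch=%d beam=%d token=%d\n", batch_idx, beam_idx, token_id);
}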
struct __align__(8) MD {
    float m;
    float d;
};

__device__ __forceinline__ MD reduce_md_op(MD a, MD b)
{
    bool a_bigger  = (a.m > b.m);
    MD   bigger_m  = a_bigger ? a : b;
    MD   smaller_m = a_bigger ? b : a;
    MD   res;
    res.d = bigger_m.d + smaller_m.d * __expf(smaller_m.m - bigger_m.m);
    res.m = bigger_m.m;
    return res;
}
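reduce_md_op is the merge step of the "online softmax": each partial result carries a running maximum m and the sum d of exp(x - m) over the elements it has seen, and two partials combine associatively, so the softmax denominator falls out of a single pass or a tree reduction. A minimal CPU sketch of the same idea, assuming float inputs; names are illustrative and not part of the deleted kernels:

// CPU reference for the (max, denominator) merge used by the kernels below.
#include <cmath>
#include <vector>

struct MDRef {
    float m;  // running maximum
    float d;  // sum of exp(x - m) over the elements seen so far
};

MDRef merge_md(MDRef a, MDRef b)
{
    MDRef big   = (a.m > b.m) ? a : b;
    MDRef small = (a.m > b.m) ? b : a;
    return {big.m, big.d + small.d * std::exp(small.m - big.m)};
}

MDRef reduce_md(const std::vector<float>& x)
{
    MDRef total{-INFINITY, 0.0f};
    for (float v : x) {
        total = merge_md(total, MDRef{v, 1.0f});
    }
    // log-softmax of x[i] is then x[i] - total.m - log(total.d),
    // which is exactly how the kernels below compute their candidate scores.
    return total;
}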
template<typename T, int MAX_K>
struct TopKMD {
    MD             md;
    TopK<T, MAX_K> topk;
};

template<typename T, int MAX_K>
__device__ __forceinline__ TopKMD<T, MAX_K> reduce_topk_md_op(const TopKMD<T, MAX_K>& a, const TopKMD<T, MAX_K>& b)
{
    TopKMD<T, MAX_K> res;
    res.md   = reduce_md_op(a.md, b.md);
    res.topk = reduce_topk_op(a.topk, b.topk);
    return res;
}
template<typename T, int ITEMS_PER_THREAD, int MAX_K, int THREADBLOCK_SIZE>
__launch_bounds__(THREADBLOCK_SIZE) __global__ void beam_online_softmax_topk_kernel(const T* __restrict x,
                                                                                    const T* __restrict b,
                                                                                    const float* __restrict c,
                                                                                    const bool* __restrict finished,
                                                                                    int* __restrict z,
                                                                                    T* __restrict v,
                                                                                    int V,
                                                                                    int K,
                                                                                    const int* __restrict end_ids)
{
    int thread_id = threadIdx.x;
    int vector_id = blockIdx.x;

    const bool IS_FP16   = std::is_same<T, half>::value;
    const T    MAX_T_VAL = (IS_FP16) ? HALF_FLT_MAX : FLT_MAX;

    // reposition x to data for the current vector
    x += vector_id * V;

    typedef cub::BlockReduce<TopKMD<float, MAX_K>, THREADBLOCK_SIZE> BlockReduce;

    __shared__ typename BlockReduce::TempStorage temp_storage;

    TopKMD<float, MAX_K> partial;
    bool                 finish = finished[vector_id];
    for (int i = 0; i < MAX_K; ++i) {
        partial.topk.p[i] = -1;
        partial.topk.u[i] = -MAX_T_VAL;
    }
    partial.md.m = -MAX_T_VAL;
    partial.md.d = 0.0F;

    if (finish) {
        for (int elem_id = thread_id; elem_id < V; elem_id += THREADBLOCK_SIZE) {
            float elem = (elem_id == end_ids[vector_id / K]) ? MAX_T_VAL : -MAX_T_VAL;
            MD    new_elem{elem, 1.0F};
            partial.md = reduce_md_op(partial.md, new_elem);
            partial.topk.insert(elem, elem_id);
            // if (elem_id > THREADBLOCK_SIZE * MAX_K && (elem_id == E)) break;
        }
    }
    else {
        for (int elem_id = thread_id; elem_id < V; elem_id += THREADBLOCK_SIZE) {
            float elem = x[elem_id] + b[elem_id];
            MD    new_elem{elem, 1.0F};
            partial.md = reduce_md_op(partial.md, new_elem);
            partial.topk.insert(elem, elem_id);
        }
    }

    TopKMD<float, MAX_K> total = BlockReduce(temp_storage).Reduce(partial, reduce_topk_md_op<float, MAX_K>);

    if (thread_id == 0) {
        z += vector_id * K;
        v += vector_id * K;
        c += vector_id;

        // float d_total_inverse = __fdividef(1.0F, total.md.d);
        float d_total_log = logf(total.md.d);
        for (int i = 0; i < MAX_K; ++i) {
            // float val = __expf(total.topk.u[i] - total.md.m) * d_total_inverse;
            float val = total.topk.u[i] - total.md.m - d_total_log;
            if (i < K) {
                z[i] = total.topk.p[i] + vector_id * V;  // faster transformer needs absolute id
                v[i] = val + c[0];
            }
        }
    }
}
template<typename T, int ITEMS_PER_THREAD, int MAX_K, int THREADBLOCK_SIZE>
__launch_bounds__(THREADBLOCK_SIZE, 1) __global__
    void beam_online_softmax_topk_stage1_kernel(const T* __restrict x,
                                                const T* __restrict b,
                                                const bool* __restrict finished,
                                                float* __restrict t,
                                                int V,
                                                int K,
                                                const int* __restrict end_ids)
{
    int thread_id = threadIdx.x;
    int vector_id = blockIdx.x;  // batch beam index.

    const int PACKED_TOP_KMD_SIZE = 2 * MAX_K + 2;

    const bool IS_FP16   = std::is_same<T, half>::value;
    const T    MAX_T_VAL = (IS_FP16) ? HALF_FLT_MAX : FLT_MAX;

    // one will have multiple sections per V
    const int v_local       = (V + gridDim.y - 1) / gridDim.y;
    const int section_start = v_local * blockIdx.y;
    int       section_end   = section_start + v_local;
    section_end             = (section_end > V) ? V : section_end;

    // reposition x to data for the current vector
    x += vector_id * V;

#if TOPK_FP16_STORAGE == 1
    typedef cub::BlockReduce<TopKMD<__half, MAX_K>, THREADBLOCK_SIZE> BlockReduce;
#else
    typedef cub::BlockReduce<TopKMD<T, MAX_K>, THREADBLOCK_SIZE> BlockReduce;
#endif
    __shared__ typename BlockReduce::TempStorage temp_storage;
    __shared__ float                             buf_s[PACKED_TOP_KMD_SIZE];  // save intermediate result

#if TOPK_FP16_STORAGE == 1
    TopKMD<__half, MAX_K> partial;
#else
    TopKMD<T, MAX_K> partial;
#endif
    bool finish = finished[vector_id];
    for (int i = 0; i < MAX_K; ++i) {
        partial.topk.p[i] = -1;
        partial.topk.u[i] = -MAX_T_VAL;
    }
    partial.md.m = -MAX_T_VAL;
    partial.md.d = 0.0F;

    if (finish) {
#pragma unroll 1
        for (int elem_id = section_start + thread_id; elem_id < section_end; elem_id += THREADBLOCK_SIZE) {
            float elem = (elem_id == end_ids[vector_id / K]) ? MAX_T_VAL : -MAX_T_VAL;
            MD    new_elem{elem, 1.0F};
            partial.md = reduce_md_op(partial.md, new_elem);
            partial.topk.insert(elem, elem_id);
        }
    }
    else {
#pragma unroll 1
        for (int elem_id = section_start + thread_id; elem_id < section_end; elem_id += THREADBLOCK_SIZE) {
            T  bias = b == nullptr ? (T)0.0f : b[elem_id];  // gpt-2 does not use bias
            T  elem = x[elem_id] + bias;
            MD new_elem{elem, 1.0F};
            partial.md = reduce_md_op(partial.md, new_elem);
            partial.topk.insert(elem, elem_id);
        }
    }

#if TOPK_FP16_STORAGE == 1
    TopKMD<__half, MAX_K> total = BlockReduce(temp_storage).Reduce(partial, reduce_topk_md_op<__half, MAX_K>);
#else
    TopKMD<T, MAX_K> total = BlockReduce(temp_storage).Reduce(partial, reduce_topk_md_op<T, MAX_K>);
#endif

    if (thread_id == 0) {
        for (int i = 0; i < 2 * K; i++) {
            reinterpret_cast<int*>(buf_s)[i] = total.topk.p[i] + vector_id * V;  // faster transformer needs absolute id
            buf_s[MAX_K + i]                 = total.topk.u[i];
        }
        buf_s[2 * MAX_K]     = total.md.d;
        buf_s[2 * MAX_K + 1] = total.md.m;
    }
    __syncthreads();
    for (int elem_id = thread_id; elem_id < PACKED_TOP_KMD_SIZE; elem_id += THREADBLOCK_SIZE) {
        t[blockIdx.x * PACKED_TOP_KMD_SIZE * gridDim.y + blockIdx.y * PACKED_TOP_KMD_SIZE + elem_id] = buf_s[elem_id];
    }
}
template<typename T, int MAX_K, int THREADBLOCK_SIZE>
__launch_bounds__(THREADBLOCK_SIZE) __global__ void beam_online_softmax_topk_stage2_kernel(const float* __restrict x,
                                                                                           const float* __restrict c,
                                                                                           int* __restrict z,
                                                                                           T* __restrict v,
                                                                                           int K,
                                                                                           int parts_per_beam)
{
    const int vector_id           = blockIdx.x;
    const int thread_id           = threadIdx.x;
    const int PACKED_TOP_KMD_SIZE = 2 * MAX_K + 2;

    const bool IS_FP16   = std::is_same<T, half>::value;
    const T    MAX_T_VAL = (IS_FP16) ? HALF_FLT_MAX : FLT_MAX;

    extern __shared__ char buf_s_[];  // intermediate result
    float*                 buf_s = reinterpret_cast<float*>(buf_s_);
    //__shared__ float buf_s[PACKED_TOP_KMD_SIZE * THREADBLOCK_SIZE]; // intermediate result

    typedef cub::BlockReduce<TopKMD<T, MAX_K>, THREADBLOCK_SIZE> BlockReduce;

    __shared__ typename BlockReduce::TempStorage temp_storage;

    x += vector_id * PACKED_TOP_KMD_SIZE * parts_per_beam;

    TopKMD<T, MAX_K> partial;
    for (int i = 0; i < MAX_K; ++i) {
        partial.topk.p[i] = -1;
        partial.topk.u[i] = -MAX_T_VAL;
    }
    partial.md.m = -MAX_T_VAL;
    partial.md.d = 0.0F;

    // load and unpack into registers through smem
    for (int idx = thread_id; idx < PACKED_TOP_KMD_SIZE * parts_per_beam; idx += THREADBLOCK_SIZE) {
        buf_s[idx] = x[idx];
    }
    __syncthreads();

    if (threadIdx.x < parts_per_beam) {
        float* b_s = buf_s + thread_id * PACKED_TOP_KMD_SIZE;
        for (int i = 0; i < 2 * K; i++) {
            partial.topk.p[i] = reinterpret_cast<int*>(b_s)[i];
            partial.topk.u[i] = b_s[MAX_K + i];
        }
        partial.md.d = b_s[2 * MAX_K];
        partial.md.m = b_s[2 * MAX_K + 1];
    }
    __syncthreads();

    TopKMD<T, MAX_K> total = BlockReduce(temp_storage).Reduce(partial, reduce_topk_md_op<T, MAX_K>);

    if (thread_id == 0) {
        z += vector_id * 2 * K;
        v += vector_id * 2 * K;
        c += vector_id;

        float d_total_log = logf(total.md.d);
        for (int i = 0; i < MAX_K; ++i) {
            float val = (float)total.topk.u[i] - total.md.m - d_total_log;
            if (i < 2 * K) {
                z[i] = total.topk.p[i];
                v[i] = (float)val + (float)c[0];
            }
        }
    }
}
template<typename T, int MAX_K>
void beam_online_softmax_topk_stage2_kernelLauncher(const float* temp_storage,
                                                    const float* cum_log_probs,
                                                    int*         ids,
                                                    T*           vals,
                                                    int          batch_size,
                                                    int          beam_width,
                                                    int          parts_per_beam,
                                                    cudaStream_t stream)
{
    // might rewrite beam_online_softmax_topk_stage2_kernel not to depend on a constant block size
    // in order to reduce compilation time
    int smem_stage2_size = parts_per_beam * (2 * MAX_K + 2) * sizeof(float);

    if (parts_per_beam <= 32) {
        beam_online_softmax_topk_stage2_kernel<T, MAX_K, 32>
            <<<batch_size * beam_width, 32, smem_stage2_size, stream>>>(
                temp_storage, cum_log_probs, ids, vals, beam_width, parts_per_beam);
        return;
    }
    if (parts_per_beam <= 64) {
        beam_online_softmax_topk_stage2_kernel<T, MAX_K, 64>
            <<<batch_size * beam_width, 64, smem_stage2_size, stream>>>(
                temp_storage, cum_log_probs, ids, vals, beam_width, parts_per_beam);
        return;
    }
    if (parts_per_beam <= 128) {
        beam_online_softmax_topk_stage2_kernel<T, MAX_K, 128>
            <<<batch_size * beam_width, 128, smem_stage2_size, stream>>>(
                temp_storage, cum_log_probs, ids, vals, beam_width, parts_per_beam);
        return;
    }
    assert(0);
}
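For scale (an illustrative calculation, not a figure from the commit): with MAX_K = 8 and parts_per_beam = 4, the dynamic shared-memory request is 4 * (2 * 8 + 2) * sizeof(float) = 288 bytes per block, and even the 128-part case stays in the low kilobytes, well under the per-block shared-memory budget.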
template<typename T, int MAX_K>
void topK_softMax_kernelLauncher(const T*        log_probs,
                                 const T*        bias,
                                 const bool*     finished,
                                 const int*      sequence_lengths,
                                 float*          cum_log_probs,
                                 float*          output_log_probs,
                                 int*            ids,
                                 void*           temp_storage,
                                 const int       temp_storage_size,
                                 BeamHypotheses* beam_hyps,
                                 const int       batch_size,
                                 const int       beam_width,
                                 const int       vocab_size,
                                 const int*      end_ids,
                                 T               diversity_rate,
                                 const float     length_penalty,
                                 cudaStream_t    stream)
{
    const int items_per_thread = 1;
    const int block_sz         = (MAX_K < 16) ? ((MAX_K < 8) ? SMALL_TOP_K_SOFTMAX_THREADBLOCK_SIZE : 128) : 64;
    // const int block_sz = SMALL_TOP_K_SOFTMAX_THREADBLOCK_SIZE;

    assert(temp_storage_size % 2 == 0);
    assert(temp_storage_size >= 2 * batch_size * beam_width * beam_width * 2);
    // Beam search needs the sequence lengths of beams to apply length penalty.
    assert(length_penalty == 0.0f || sequence_lengths != nullptr);

    const int topk_buf_offset  = ceil(batch_size * beam_width * beam_width * 2 / 4.) * 4;
    int*      topk_tmp_id_buf  = reinterpret_cast<int*>(temp_storage);
    T*        topk_tmp_val_buf = reinterpret_cast<T*>(topk_tmp_id_buf + topk_buf_offset);
    float*    tmp_buffer       = reinterpret_cast<float*>(topk_tmp_val_buf + topk_buf_offset);

#ifdef DO_SPLIT_SMALL_TOP_K_SOFTMAX
    int voc_parts = 4;
    if (batch_size * beam_width < 256) {
        // Volta has 80 SMs, so we aim for three waves
        voc_parts = (240 + batch_size * beam_width - 1) / (batch_size * beam_width);
        voc_parts = std::min(128, voc_parts);  // we implement up to 128
    }
    dim3 grid(batch_size * beam_width, voc_parts);
    cudaFuncSetAttribute(beam_online_softmax_topk_stage1_kernel<T, items_per_thread, 2 * MAX_K, block_sz>,
                         cudaFuncAttributePreferredSharedMemoryCarveout,
                         cudaSharedmemCarveoutMaxL1);
    beam_online_softmax_topk_stage1_kernel<T, items_per_thread, 2 * MAX_K, block_sz>
        <<<grid, block_sz, 0, stream>>>(log_probs, bias, finished, tmp_buffer, vocab_size, beam_width, end_ids);
    sync_check_cuda_error();
#endif

    if (beam_width > 1) {
#ifdef DO_SPLIT_SMALL_TOP_K_SOFTMAX
        beam_online_softmax_topk_stage2_kernelLauncher<T, 2 * MAX_K>(
            tmp_buffer, cum_log_probs, topk_tmp_id_buf, topk_tmp_val_buf, batch_size, beam_width, voc_parts, stream);
        sync_check_cuda_error();
#else
        beam_online_softmax_topk_kernel<T, items_per_thread, MAX_K, block_sz>
            <<<batch_size * beam_width, block_sz, 0, stream>>>(log_probs,
                                                               bias,
                                                               cum_log_probs,
                                                               finished,
                                                               topk_tmp_id_buf,
                                                               topk_tmp_val_buf,
                                                               vocab_size,
                                                               beam_width,
                                                               end_ids);
#endif
#if 0
        // wrong result with diversity_rate != 0.f
        batch_topK_kernel<T, MAX_K, 32><<<batch_size, 32, 0, stream>>>
            (topk_tmp_id_buf, topk_tmp_val_buf, ids, cum_log_probs);
#else
        // We need 2 * MAX_K candidates because at most K candidates may be finished,
        // and finished candidates are not carried into the next iteration.
        batch_topk_kernel<T, MAX_K * 2, 32><<<batch_size, 32, 0, stream>>>(topk_tmp_id_buf,
                                                                           topk_tmp_val_buf,
                                                                           ids,
                                                                           cum_log_probs,
                                                                           output_log_probs,
                                                                           finished,
                                                                           sequence_lengths,
                                                                           *beam_hyps,
                                                                           beam_width * beam_width * 2,
                                                                           beam_width,
                                                                           vocab_size,
                                                                           length_penalty,
                                                                           diversity_rate);
        sync_check_cuda_error();
#endif
    }
    else {
        FT_CHECK(false);
#ifdef DO_SPLIT_SMALL_TOP_K_SOFTMAX
        beam_online_softmax_topk_stage2_kernelLauncher<float, MAX_K>(
            tmp_buffer, cum_log_probs, ids, cum_log_probs, batch_size, beam_width, voc_parts, stream);
#else
        beam_online_softmax_topk_kernel<T, items_per_thread, MAX_K, block_sz>
            <<<batch_size * beam_width, block_sz, 0, stream>>>(
                log_probs, bias, cum_log_probs, finished, ids, cum_log_probs, vocab_size, beam_width, end_ids);
#endif
    }
}
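The launcher carves three regions out of a single temp_storage allocation: candidate ids, candidate values, and the packed stage-1 partials. A rough host-side sizing sketch of that pointer arithmetic, under the assumptions noted in the comments (T taken as float; this helper is illustrative, not an API from the repository):

// Approximate workspace size consumed by the launcher above, assuming T == float.
#include <cmath>
#include <cstddef>

size_t beam_search_topk_workspace_bytes(int batch_size, int beam_width, int voc_parts, int max_k)
{
    // candidates kept per batch entry: beam_width * beam_width * 2, padded to a multiple of 4
    const size_t topk_buf_offset =
        static_cast<size_t>(std::ceil(batch_size * beam_width * beam_width * 2 / 4.0)) * 4;
    // stage 1 is instantiated with 2 * MAX_K, so each part packs 2 * (2 * max_k) + 2 floats
    const size_t packed_top_kmd_size = 2 * (2 * max_k) + 2;
    return topk_buf_offset * sizeof(int)                                                  // topk_tmp_id_buf
           + topk_buf_offset * sizeof(float)                                              // topk_tmp_val_buf
           + batch_size * beam_width * packed_top_kmd_size * voc_parts * sizeof(float);   // tmp_buffer
}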
#define CASE_K(K, MAX_K) \
case K ... MAX_K: \
topK_softMax_kernelLauncher<T, MAX_K>(log_probs, \
bias, \
finished, \
sequence_lengths, \
cum_log_probs, \
output_log_probs, \
ids, \
temp_storage, \
temp_storage_size, \
beam_hyps, \
batch_size, \
beam_width, \
vocab_size, \
end_ids, \
diversity_rate, \
length_penalty, \
stream); \
break;
template<typename T>
void invokeTopkSoftMax(const T*        log_probs,
                       const T*        bias,
                       const bool*     finished,
                       const int*      sequence_lengths,
                       float*          cum_log_probs,
                       float*          output_log_probs,
                       int*            ids,
                       void*           temp_storage,
                       const int       temp_storage_size,
                       BeamHypotheses* beam_hyps,
                       const int       batch_size,
                       const int       beam_width,
                       const int       vocab_size,
                       const int*      end_ids,
                       const float     diversity_rate,
                       const float     length_penalty,
                       cudaStream_t    stream)
{
    switch (beam_width) {
        CASE_K(1, 4);
        CASE_K(5, 8);
        CASE_K(9, 16);
        CASE_K(17, 32);
        CASE_K(33, 64);
        default:
            throw std::runtime_error(
                fmtstr("Topk kernel of beam search does not support beam_width=%d", beam_width));
    }
}

#undef CASE_K
template void invokeTopkSoftMax<float>(const float*    log_probs,
                                       const float*    bias,
                                       const bool*     finished,
                                       const int*      sequence_lengths,
                                       float*          cum_log_probs,
                                       float*          output_log_probs,
                                       int*            ids,
                                       void*           tmp_storage,
                                       const int       temp_storage_size,
                                       BeamHypotheses* beam_hyps,
                                       const int       batch_size,
                                       const int       beam_width,
                                       const int       vocab_size,
                                       const int*      end_ids,
                                       const float     diversity_rate,
                                       const float     length_penalty,
                                       cudaStream_t    stream);

template void invokeTopkSoftMax<half>(const half*     log_probs,
                                      const half*     bias,
                                      const bool*     finished,
                                      const int*      sequence_lengths,
                                      float*          cum_log_probs,
                                      float*          output_log_probs,
                                      int*             ids,
                                      void*           tmp_storage,
                                      const int       temp_storage_size,
                                      BeamHypotheses* beam_hyps,
                                      const int       batch_size,
                                      const int       beam_width,
                                      const int       vocab_size,
                                      const int*      end_ids,
                                      const float     diversity_rate,
                                      const float     length_penalty,
                                      cudaStream_t    stream);

}  // end of namespace turbomind
src/turbomind/kernels/online_softmax_beamsearch_kernels.h
deleted
100644 → 0
View file @
83697422
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/turbomind/kernels/beam_search_topk_kernels.h"
namespace turbomind {

template<typename T>
void invokeTopkSoftMax(const T*        log_probs,
                       const T*        bias,
                       const bool*     finished,
                       const int*      sequence_lengths,
                       float*          cum_log_probs,
                       float*          output_log_probs,
                       int*            ids,
                       void*           tmp_storage,
                       const int       temp_storage_size,
                       BeamHypotheses* beam_hyps,
                       const int       batch_size,
                       const int       beam_width,
                       const int       vocab_size,
                       const int*      end_ids,
                       const float     diversity_rate,
                       const float     length_penalty,
                       cudaStream_t    stream);

}  // namespace turbomind
src/turbomind/layers/CMakeLists.txt
View file @
981a4610
...
...
@@ -14,13 +14,10 @@
cmake_minimum_required(VERSION 3.8)

add_subdirectory(beam_search_layers)
add_subdirectory(sampling_layers)

add_library(DynamicDecodeLayer STATIC DynamicDecodeLayer.cc)
set_property(TARGET DynamicDecodeLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET DynamicDecodeLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(DynamicDecodeLayer PUBLIC -lcudart
                      TopKSamplingLayer TopPSamplingLayer
                      OnlineBeamSearchLayer BeamSearchLayer ban_bad_words stop_criteria
                      gpt_kernels tensor nvtx_utils)
target_link_libraries(DynamicDecodeLayer PUBLIC -lcudart TopKSamplingLayer
                      TopPSamplingLayer ban_bad_words stop_criteria gpt_kernels tensor nvtx_utils)
src/turbomind/layers/DynamicDecodeLayer.cc
View file @
981a4610
...
...
@@ -17,11 +17,9 @@
#include "src/turbomind/layers/DynamicDecodeLayer.h"
#include "src/turbomind/kernels/ban_bad_words.h"
#include "src/turbomind/kernels/stop_criteria_kernels.h"
#include "src/turbomind/layers/beam_search_layers/BaseBeamSearchLayer.h"
#include "src/turbomind/layers/beam_search_layers/BeamSearchLayer.h"
#include "src/turbomind/layers/beam_search_layers/OnlineBeamSearchLayer.h"
#include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
#include "src/turbomind/layers/sampling_layers/TopPSamplingLayer.h"
#include "src/turbomind/utils/cuda_utils.h"
namespace turbomind {
...
...
@@ -45,37 +43,6 @@ template<typename T>
void DynamicDecodeLayer<T>::initialize()
{
    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    online_beamsearch_decode_ = new OnlineBeamSearchLayer<T>(0,     // max_batch_size, deprecated
                                                             0,     // local_head_num, deprecated
                                                             0,     // size_per_head, deprecated
                                                             0,     // beam_width, deprecated
                                                             vocab_size_,
                                                             vocab_size_padded_,
                                                             0,     // end_id, deprecated
                                                             0.0f,  // beam_search_diversity_rate_, deprecated
                                                             1.0f,  // temperature_, deprecated
                                                             0.0f,  // len_penalty_, deprecated
                                                             1.0f,  // repetition_penalty_, deprecated
                                                             stream_,
                                                             cublas_wrapper_,
                                                             allocator_,
                                                             is_free_buffer_after_forward_);

    beamsearch_decode_ = new BeamSearchLayer<T>(0,     // max_batch_size, deprecated
                                                0,     // local_head_num, deprecated
                                                0,     // size_per_head, deprecated
                                                0,     // beam_width, deprecated
                                                vocab_size_,
                                                vocab_size_padded_,
                                                0,     // end_id, deprecated
                                                0.0f,  // beam_search_diversity_rate_, deprecated
                                                1.0f,  // temperature_, deprecated
                                                0.0f,  // len_penalty_, deprecated
                                                1.0f,  // repetition_penalty_, deprecated
                                                stream_,
                                                cublas_wrapper_,
                                                allocator_,
                                                is_free_buffer_after_forward_);

    topk_decode_ = new TopKSamplingLayer<T>(0,
                                            vocab_size_,
...
...
@@ -131,8 +98,6 @@ template<typename T>
DynamicDecodeLayer<T>::~DynamicDecodeLayer()
{
    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    delete online_beamsearch_decode_;
    delete beamsearch_decode_;
    delete topk_decode_;
    delete topp_decode_;
    freeBuffer();
...
...
@@ -284,105 +249,7 @@ void DynamicDecodeLayer<T>::forward(TensorMap* output_tensors, TensorMap* input_
// dynamic decode GPT
    if (beam_width > 1) {
        // Because batch beam search is not supported yet, we compute one entry at a time
        // if there are different runtime arguments.
        const size_t dynamic_decode_batch_size       = has_diff_runtime_args_ ? 1 : local_batch_size;
        const int    dynamic_decode_total_iteration  = local_batch_size / dynamic_decode_batch_size;

        for (uint dynamic_ite = ite * dynamic_decode_total_iteration;
             dynamic_ite < (ite + 1) * dynamic_decode_total_iteration;
             ++dynamic_ite) {
            const int dynamic_id_offset                      = dynamic_ite * dynamic_decode_batch_size * beam_width;
            const int dynamic_decode_vocab_size_units_offset = dynamic_id_offset * vocab_size_padded_;

            // common inputs
            Tensor logits = input_tensors->at("logits");
            Tensor end_id = input_tensors->at("end_id");

            TensorMap dynamic_decode_input_tensors(
                {{"logits",
                  Tensor{logits.where,
                         logits.type,
                         {dynamic_decode_batch_size, logits.shape[1], logits.shape[2]},
                         logits.getPtrWithOffset(dynamic_decode_vocab_size_units_offset)}},
                 {"step", input_tensors->at("step")},
                 {"max_input_length", input_tensors->at("max_input_length")},
                 {"end_id",
                  Tensor{end_id.where,
                         end_id.type,
                         {dynamic_decode_batch_size},
                         end_id.getPtrWithOffset(dynamic_ite * dynamic_decode_batch_size)}},
                 {"ite", Tensor{MEMORY_CPU, TYPE_UINT32, {1}, &dynamic_ite}}});

            if (input_tensors->isExist("embedding_bias")) {
                dynamic_decode_input_tensors.insert({"embedding_bias", input_tensors->at("embedding_bias")});
            }
            if (input_tensors->isExist("input_lengths")) {
                Tensor input_lengths = input_tensors->at("input_lengths");
                dynamic_decode_input_tensors.insert(
                    {"input_lengths",
                     input_lengths.slice({dynamic_decode_batch_size, input_lengths.shape[1]}, dynamic_id_offset)});
            }

            for (auto t = input_tensors->begin(); t != input_tensors->end(); ++t) {
                if (t->first.find("random_seed") == std::string::npos) {
                    dynamic_decode_input_tensors.insert(*t);
                }
            }

            // common outputs
            TensorMap dynamic_decode_output_tensors({{"output_ids", output_tensors->at("output_ids")}});
            if (output_tensors->isExist("sequence_length")) {
                Tensor sequence_length = output_tensors->at("sequence_length");
                dynamic_decode_output_tensors.insert({"sequence_length",
                                                      Tensor{sequence_length.where,
                                                             sequence_length.type,
                                                             {dynamic_decode_batch_size * beam_width},
                                                             sequence_length.getPtrWithOffset(dynamic_id_offset)}});
            }
            if (output_tensors->isExist("finished")) {
                Tensor finished = output_tensors->at("finished");
                dynamic_decode_output_tensors.insert({"finished",
                                                      Tensor{finished.where,
                                                             finished.type,
                                                             {dynamic_decode_batch_size * beam_width},
                                                             finished.getPtrWithOffset(dynamic_id_offset)}});
            }
            if (output_tensors->isExist("cum_log_probs")) {
                Tensor cum_log_probs = output_tensors->at("cum_log_probs");
                dynamic_decode_output_tensors.insert({"cum_log_probs",
                                                      Tensor{cum_log_probs.where,
                                                             cum_log_probs.type,
                                                             {dynamic_decode_batch_size * beam_width},
                                                             cum_log_probs.getPtrWithOffset(dynamic_id_offset)}});
            }
            if (output_tensors->isExist("beam_hyps")) {
                dynamic_decode_output_tensors.insert("beam_hyps", output_tensors->at("beam_hyps"));
            }
            if (output_tensors->isExist("output_log_probs")) {
                dynamic_decode_output_tensors.insert({"output_log_probs", output_tensors->at("output_log_probs")});
            }

            dynamic_decode_input_tensors.insert({"src_cache_indirection", input_tensors->at("src_cache_indirection")});
            dynamic_decode_output_tensors.insert({"parent_ids", output_tensors->at("parent_ids")});
            dynamic_decode_output_tensors.insert(
                {"tgt_cache_indirection", output_tensors->at("tgt_cache_indirection")});

            FT_CHECK_WITH_INFO(dynamic_decode_output_tensors.isExist("cum_log_probs"),
                               "cum_log_probs should be provided in beam search.");

            if (true || beam_width < 16
                || (output_tensors->isExist("beam_hyps")
                    && input_tensors->getVal<float>("beam_search_diversity_rate", 0.0f) != 0.0f)) {
                // only online_beamsearch_decode_ supports beam_search_diversity_rate when beam_hyps is used
                online_beamsearch_decode_->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors);
            }
            else {
                FT_CHECK(false);  // deprecate this module
                beamsearch_decode_->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors);
            }
        }  // end of dynamic_ite
        FT_CHECK_WITH_INFO(0, "Beam-search is not supported.");
    }
    else {  // beam_width = 1
        // In sampling, batch sampling is supported, so we always compute all sentences at once.
...
...
src/turbomind/layers/DynamicDecodeLayer.h
View file @
981a4610
...
...
@@ -19,7 +19,6 @@
#include <string>
#include <unordered_map>
#include "src/turbomind/kernels/beam_search_topk_kernels.h"
#include "src/turbomind/layers/BaseLayer.h"
#include "src/turbomind/layers/DynamicDecodeBaseLayer.h"
#include "src/turbomind/layers/sampling_layers/TopPSamplingLayer.h"
...
...
@@ -34,8 +33,6 @@ protected:
    void initialize();
    bool hasDiffRuntimeArgs(TensorMap* input_tensors);

    DynamicDecodeBaseLayer* online_beamsearch_decode_;
    DynamicDecodeBaseLayer* beamsearch_decode_;
    DynamicDecodeBaseLayer* topk_decode_;
    DynamicDecodeBaseLayer* topp_decode_;
...
...