booldispatch_additive_masked_softmax_dropout(output_t*dst,uint8_t*dropout_mask,constinput_t*src,constinput_t*pad_mask,inttotalElements,intsoftmax_elements,intsoftmax_elements_stride,intbatch_count,intpad_batch_stride,floatp,cudaStream_tstreamid)// p is the probability to keep, not drop
{
if(softmax_elements==0){
returntrue;
}elseif(softmax_elements<=2048){
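// Compute the function index: there is one function for each power-of-two size.
// NOTE(review): this comment previously said "up to 1024", but the guard above admits softmax_elements <= 2048 — confirm the actual upper bound.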