llama_decoder_kernels.h 304 Bytes
Newer Older
Li Zhang's avatar
Li Zhang committed
1
2
3
4
5
6
7
8
9
10
11
// Copyright (c) OpenMMLab. All rights reserved.

#include <cuda_runtime.h>

namespace fastertransformer {

/**
 * Launcher declaration for a fused "add residual + RMSNorm" operation.
 *
 * Only the declaration lives in this header; the definition (and any explicit
 * instantiations for concrete T) must be provided by another translation unit.
 * The function carries no __global__/__device__ qualifier, so it is called
 * from host code.
 *
 * NOTE(review): the parameter semantics below are inferred from the names
 * only -- nothing in this header establishes them. Confirm against the
 * definition before relying on them.
 *
 * @tparam T          Element type of the three buffers.
 * @param residual    Mutable buffer (non-const pointer); presumably the
 *                    residual stream, updated in place -- TODO confirm.
 * @param inout       Mutable buffer (non-const pointer); presumably read as
 *                    input and overwritten with the result -- TODO confirm.
 * @param scale       Read-only buffer (const pointer); presumably the RMSNorm
 *                    weight with n_dims elements -- TODO confirm.
 * @param eps         Presumably the epsilon used for numerical stability in
 *                    the normalization -- TODO confirm.
 * @param batch_size  Presumably the number of rows to process -- TODO confirm.
 * @param n_dims      Presumably the number of elements per row -- TODO confirm.
 * @param stream      CUDA stream; presumably the stream on which the work is
 *                    enqueued -- TODO confirm.
 */
template<typename T>
void invokeFusedAddResidualRMSNorm(T*           residual,
                                   T*           inout,
                                   const T*     scale,
                                   float        eps,
                                   int          batch_size,
                                   int          n_dims,
                                   cudaStream_t stream);

}  // namespace fastertransformer