// Adapted from FasterTransformer, https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
#pragma once
#include <assert.h>
#include <stdint.h>
#include <float.h>
#include <cassert>
#include <cstdint>
#include <cfloat>
#include <type_traits>
#include <cstdio>
#include <cuda_fp16.h>
#ifdef ENABLE_BF16
#include <cuda_bf16.h>
#endif
__device____forceinline__
staticvoidtrap_unsupported_arch(){
if(blockIdx.x==0&&blockIdx.y==0&&threadIdx.x==0){
printf("This kernel is not supported on your GPU\n");