#pragma once

// Declarations of the host-side launcher templates for the inference kernels,
// plus a few compile-time tuning constants. Only declarations live here; the
// definitions (and their explicit instantiations) are in other translation
// units.
//
// NOTE(review): in the text this header was reviewed from, the targets of all
// `#include` directives and the parameter lists of all `template` headers were
// missing. The include list below is a reconstruction based on the platform
// guards and on the types the declarations use (cudaStream_t, int8_t) --
// confirm it against version control before merging.

#ifdef __HIP_PLATFORM_HCC__
// Defined to the value 1 (not the token sequence `= 1`) so that it is usable
// with both `#ifdef` and `#if`.
#define HALF_PRECISION_AVAILABLE 1
#include <hip/hip_cooperative_groups.h>
#else
// __CUDA_ARCH__ is only defined while compiling device code, so on the CUDA
// path this macro is defined only in device passes targeting SM70 or newer.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
#define HALF_PRECISION_AVAILABLE 1
#endif
#include <cooperative_groups.h>
#endif

#include <cuda.h>
#include <cuda_fp16.h>
#include <stdint.h>  // int8_t, used by launch_dequantize
#include <stdio.h>
#include <stdlib.h>
#include <cassert>
#include <iostream>

// Compile-time tuning constants shared by the kernel implementations.
// NOTE(review): meanings below are inferred from the names -- confirm.
#define MAX_WARP_NUM 32    // presumably the maximum number of warps per block
#define WARP_SIZE 32       // presumably the number of threads per warp
#define SMs 80             // presumably the SM count of the target GPU
#define MAX_REGISTERS 256  // presumably a per-thread register budget

// Every launcher below takes the stream as its last argument; presumably all
// work is enqueued on that stream -- confirm against the definitions.

// Attention softmax launcher (v2).
template <typename T>
void launch_attn_softmax_v2(T* vals,
                            T* mask,
                            bool triangular,
                            bool recompute,
                            bool local_attention,
                            int window_size,
                            int batch_size,
                            int heads,
                            int num_seq,
                            int sequence_length,
                            float scale,
                            cudaStream_t stream);

// Fused bias add with gelu activation
template <typename T>
void launch_bias_gelu(T* input,
                      const T* bias,
                      int intermediate_size,
                      int batch_size,
                      cudaStream_t stream);

// Bias add launcher; `input` is non-const, so it is presumably updated in
// place -- confirm.
template <typename T>
void launch_bias_add(T* input,
                     const T* bias,
                     int hidden_size,
                     int batch_size,
                     cudaStream_t stream);

// Bias + residual launcher.
template <typename T>
void launch_bias_residual(T* input,
                          T* output,
                          T* attn,
                          T* bias,
                          T* attn_bias,
                          int batch,
                          int hidden_dim,
                          int mp_size,
                          cudaStream_t stream);

// Layer-norm launcher.
template <typename T>
void launch_layer_norm(T* out,
                       T* vals,
                       const T* gamma,
                       const T* beta,
                       float epsilon,
                       int batch_size,
                       int hidden_dim,
                       cudaStream_t stream);

// Residual + layer-norm launcher.
template <typename T>
void launch_residual_layer_norm(T* norm,
                                T* res_add,
                                T* vals,
                                T* residual,
                                const T* bias,
                                const T* gamma,
                                const T* beta,
                                float epsilon,
                                int batch_size,
                                int hidden_dim,
                                bool preLN,
                                bool mlp_after_attn,
                                cudaStream_t stream);

// Dequantization launcher: int8 `input` with float `qscale` to `output` of
// type T.
template <typename T>
void launch_dequantize(T* output,
                       const int8_t* input,
                       const float* qscale,
                       unsigned output_size,
                       unsigned hidden_dim,
                       unsigned groups,
                       unsigned merge_count,
                       cudaStream_t stream);

// GPT-J residual add launcher.
template <typename T>
void launch_gptj_residual_add(T* input,
                              T* output,
                              T* attn,
                              T* bias,
                              T* attn_bias,
                              int batch,
                              int head_size,
                              int mp_size,
                              cudaStream_t stream);

// Rotary position embedding launcher; both `mixed_query` and `key_layer` are
// non-const, so they are presumably updated in place -- confirm.
template <typename T>
void launch_apply_rotary_pos_emb(T* mixed_query,
                                 T* key_layer,
                                 unsigned head_size,
                                 unsigned seq_len,
                                 unsigned rotary_dim,
                                 unsigned offset,
                                 unsigned num_heads,
                                 unsigned batch,
                                 bool rotate_half,
                                 bool rotate_every_two,
                                 cudaStream_t stream);

// MoE residual matmul launcher.
template <typename T>
void launch_moe_res_matmul(T* residual,
                           T* coef,
                           T* mlp_out,
                           int seq_len,
                           int hidden_dim,
                           cudaStream_t stream);