spatial_cuda_layers.h 857 Bytes
Newer Older
aiss's avatar
aiss committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
/*
Copyright 2022 The Microsoft DeepSpeed Team
*/

#pragma once

// HALF_PRECISION_AVAILABLE is defined only in device compilation passes that
// target compute capability 5.3 or newer. In host passes __CUDA_ARCH__ is not
// defined, so the macro stays undefined there.
//
// The macro expands to the plain integer 1 (not "= 1"), so it can be tested
// with either `#ifdef HALF_PRECISION_AVAILABLE` or `#if HALF_PRECISION_AVAILABLE`.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#define HALF_PRECISION_AVAILABLE 1
#endif

#ifdef __HIPCC__
#include <hip/hip_cooperative_groups.h>
#else
#include <cooperative_groups.h>
#endif 

#include <cuda.h>
#include <cuda_fp16.h>

/*********** Channels-Last Bias Add Structs and Launchers ************/

/*
 * Plain aggregate holding three 64-bit extents.
 *
 * NOTE(review): judging by the type and field names, this presumably
 * describes the dimensions of a channels-last activation tensor
 * (batch x sequence x channels) -- confirm against the kernel sources.
 */
typedef struct {
    int64_t batch_size;  // extent of the batch dimension
    int64_t seq_len;     // extent of the sequence dimension
    int64_t channels;    // extent of the channel dimension
} ChannelsLastProblem;

/*
 * Host-callable launcher declaration; no definition is visible in this header.
 * NOTE(review): judging by the name, this launches a bias-add kernel over
 * half-precision data -- confirm the exact arithmetic against the definition.
 *
 * Parameters:
 *   result      Output buffer (the only non-const pointer argument).
 *   activation  Read-only half-precision input.
 *   bias        Read-only half-precision input.
 *               Presumably broadcast along the channel dimension -- TODO confirm.
 *   other       Read-only half-precision input.
 *               NOTE(review): looks like an optional second operand; verify
 *               whether nullptr is accepted.
 *   other_bias  Read-only half-precision input.
 *               NOTE(review): looks like an optional bias for `other`; verify
 *               whether nullptr is accepted.
 *   batch_size  Problem extent, passed as a 32-bit int.
 *   seq_len     Problem extent, passed as a 32-bit int.
 *   channels    Problem extent, passed as a 32-bit int.
 *   stream      CUDA stream handle supplied by the caller; presumably the
 *               stream the work is enqueued on -- confirm in the definition.
 *
 * Note: the extents here are plain `int`, whereas ChannelsLastProblem stores
 * the same-named fields as int64_t.
 */
void launch_opt_bias_add(__half* result,
                         const __half* activation,
                         const __half* bias,
                         const __half* other,
                         const __half* other_bias,
                         int batch_size,
                         int seq_len,
                         int channels,
                         cudaStream_t stream);