custom_cuda_kernel.cu 1.25 KB
Newer Older
aiss's avatar
aiss committed
1
2
3
4
5
6
/*
Copyright The Microsoft DeepSpeed Team
*/
#ifdef __HIPCC__
#include "custom_hip_layers.h"
#else
aiss's avatar
aiss committed
7
#include "custom_cuda_layers.h"
aiss's avatar
aiss committed
8
#endif
aiss's avatar
aiss committed
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
__global__ void param_update_kernel(const float* input, __half* output, int size)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;

    if (id < size) { output[id] = (__half)input[id]; }
}

void launch_param_update(const float* input, __half* output, int size, cudaStream_t stream)
{
    int threads = 1024;

    dim3 grid_dim((size - 1) / threads + 1);
    dim3 block_dim(threads);

    param_update_kernel<<<grid_dim, block_dim, 0, stream>>>(input, output, size);
}

__global__ void param_update_kernel_half(const float* input, __half* output, int size)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    __half2* output_cast = reinterpret_cast<__half2*>(output);
    if (id < size) {
        float input_f = input[id];
        __half2* input_h = reinterpret_cast<__half2*>(&input_f);
        output_cast[id] = *input_h;
    }
}

void launch_param_update_half(const float* input, __half* output, int size, cudaStream_t stream)
{
    int threads = 1024;
    size /= 2;
    dim3 grid_dim((size - 1) / threads + 1);
    dim3 block_dim(threads);

    param_update_kernel_half<<<grid_dim, block_dim, 0, stream>>>(input, output, size);
}