// custom_cuda_kernel.cu
// (Repository blame-view residue removed here: file-size label, author/commit labels for "Jeff Rasley", and gutter line numbers.)


#include "custom_cuda_layers.h"

/*
 * Element-wise copy of `input` (float) into `output` (__half).
 *
 * Indexing: one thread per element on a 1D grid of 1D blocks; the flat index is
 * blockIdx.x * blockDim.x + threadIdx.x. Threads whose index is >= `size` do
 * nothing, so the grid may be rounded up past the element count.
 *
 * input  - source values; read at indices [0, size).
 * output - destination values; written at indices [0, size).
 * size   - number of elements to convert.
 *
 * No shared memory is used. Both pointers are dereferenced on the device, so
 * they must be device-accessible.
 */
__global__ void param_update_kernel(const float* input, __half* output, int size)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Tail threads of the last block have no element to convert.
    if (idx >= size) { return; }

    output[idx] = static_cast<__half>(input[idx]);
}

/*
 * Host-side launcher for param_update_kernel: converts `size` floats from
 * `input` into __half values in `output`.
 *
 * input  - source buffer, forwarded unchanged to the kernel.
 * output - destination buffer, forwarded unchanged to the kernel.
 * size   - number of elements to convert; values <= 0 make this a no-op.
 * stream - CUDA stream the kernel is enqueued on.
 *
 * Launch shape: 1D blocks of 1024 threads, ceil(size / 1024) blocks, and no
 * dynamic shared memory. The function only enqueues the kernel; it does not
 * synchronize with the stream or inspect the launch status.
 */
void launch_param_update(const float* input, __half* output, int size, cudaStream_t stream)
{
    // Nothing to convert. Without this guard, size == 0 would still launch one
    // idle block, and sufficiently negative sizes would produce a zero or
    // wrapped-around block count, i.e. an invalid launch configuration.
    if (size <= 0) { return; }

    const int threads = 1024;

    // Ceiling division written as (size - 1) / threads + 1 so the intermediate
    // value cannot overflow int even when size is close to INT_MAX.
    const int blocks = (size - 1) / threads + 1;

    dim3 grid_dim(blocks);
    dim3 block_dim(threads);

    param_update_kernel<<<grid_dim, block_dim, 0, stream>>>(input, output, size);
}