/*
 * OCFD_warp_shuffle.h
 *
 * Warp-shuffle wrappers for double-precision values.
 */
#ifndef __OCFD_WARP_SHUFFLE_H
#define __OCFD_WARP_SHUFFLE_H
#include "cuda_commen.h"
#include "cuda_runtime.h"
#include "cuda.h"
#include "cuda_runtime_api.h"
#ifdef __cplusplus
extern "C"{
#endif


#ifdef __NVCC__
/**
 * Warp shuffle-up for a double: thin wrapper over __shfl_up_sync.
 *
 * Per the CUDA warp-shuffle documentation, the caller receives the value held
 * by the lane whose ID is `delta` lower within its `width`-lane segment; lanes
 * with no such source get their own `val` back.
 *
 * @param val    value contributed by the calling lane (read only)
 * @param delta  lane offset forwarded to __shfl_up_sync
 * @param width  segment width forwarded to __shfl_up_sync
 * @return       the double obtained from the shuffle
 *
 * The participation mask is hard-coded to 0xffffffff, i.e. all 32 lanes are
 * named as participants, so the call must not sit in code that only part of
 * the warp executes.
 */
__device__ __forceinline__ double __shfl_up_double(const double & val , unsigned char delta , unsigned char width ){
    // const reference: the value is only read, so const lvalues and temporaries are accepted too.
    return __shfl_up_sync(0xffffffff , val , delta , width);
}
/**
 * Warp shuffle-down for a double: thin wrapper over __shfl_down_sync.
 *
 * Per the CUDA warp-shuffle documentation, the caller receives the value held
 * by the lane whose ID is `delta` higher within its `width`-lane segment; lanes
 * with no such source get their own `val` back.
 *
 * @param val    value contributed by the calling lane (read only)
 * @param delta  lane offset forwarded to __shfl_down_sync
 * @param width  segment width forwarded to __shfl_down_sync
 * @return       the double obtained from the shuffle
 *
 * The participation mask is hard-coded to 0xffffffff, i.e. all 32 lanes are
 * named as participants, so the call must not sit in code that only part of
 * the warp executes.
 */
__device__ __forceinline__ double __shfl_down_double(const double & val , unsigned char delta , unsigned char width ){
    // const reference: the value is only read, so const lvalues and temporaries are accepted too.
    return __shfl_down_sync(0xffffffff , val , delta , width);
}
/**
 * Indexed warp shuffle for a double: thin wrapper over __shfl_sync.
 *
 * Per the CUDA warp-shuffle documentation, the caller receives the value held
 * by lane `srcLane` of its `width`-lane segment.
 *
 * @param val      value contributed by the calling lane (read only)
 * @param srcLane  source lane index forwarded to __shfl_sync
 * @param width    segment width forwarded to __shfl_sync
 * @return         the double obtained from the shuffle
 *
 * The participation mask is hard-coded to 0xffffffff, i.e. all 32 lanes are
 * named as participants, so the call must not sit in code that only part of
 * the warp executes.
 */
__device__ __forceinline__ double __shfl_double(const double & val , unsigned char srcLane , unsigned char width){
    // const reference: the value is only read, so const lvalues and temporaries are accepted too.
    return __shfl_sync(0xffffffff , val , srcLane , width);
}

/**
 * Butterfly (XOR) warp shuffle for a double: thin wrapper over __shfl_xor_sync.
 *
 * Note on naming: `srcLane` is passed as the third argument of
 * __shfl_xor_sync, which the CUDA documentation calls the lane mask; the
 * source lane is the caller's lane ID XOR-ed with that value, not `srcLane`
 * itself.
 *
 * @param val      value contributed by the calling lane (read only)
 * @param srcLane  XOR lane mask forwarded to __shfl_xor_sync
 * @param width    segment width forwarded to __shfl_xor_sync
 * @return         the double obtained from the shuffle
 *
 * The participation mask is hard-coded to 0xffffffff, i.e. all 32 lanes are
 * named as participants, so the call must not sit in code that only part of
 * the warp executes.
 */
__device__ __forceinline__ double __shfl_xor_double(const double & val , unsigned char srcLane , unsigned char width){
    // const reference: the value is only read, so const lvalues and temporaries are accepted too.
    return __shfl_xor_sync(0xffffffff , val , srcLane , width);
}

#else

/*
 * Fallback shuffle-up for doubles, used when __NVCC__ is not defined.
 *
 * The macro reinterprets the caller's double lvalue as an int2 and hands it to
 * __shfl_up_double_, which shuffles the two 32-bit halves separately with the
 * mask-less __shfl_up intrinsic and reassembles them into a double.
 *
 * `val` must be an addressable double lvalue. Every macro argument is
 * parenthesised so that compound expressions (e.g. `c ? a : b`) are
 * address-taken as a whole instead of being split by operator precedence.
 */
#define __shfl_up_double(val , delta , width) __shfl_up_double_( *( (int2*)(&(val)) ) , (delta) , (width) )
/**
 * @param val    the double's bit pattern viewed as two 32-bit integers
 * @param delta  lane offset forwarded to __shfl_up
 * @param width  segment width forwarded to __shfl_up
 * @return       the double rebuilt from the two shuffled halves
 */
__device__ __forceinline__ double __shfl_up_double_(int2 & val , unsigned char delta , unsigned char width ){
    int2 out = val;  // work on a copy; the caller's storage is left untouched
    out.x = __shfl_up(out.x , delta , width);
    out.y = __shfl_up(out.y , delta , width);
    // Both halves were shuffled with identical delta/width, so they are read
    // back together as a single double.
    return ( *( (double*)(&out) ) );
}

/*
 * Fallback shuffle-down for doubles, used when __NVCC__ is not defined.
 *
 * The macro reinterprets the caller's double lvalue as an int2 and hands it to
 * __shfl_down_double_, which shuffles the two 32-bit halves separately with
 * the mask-less __shfl_down intrinsic and reassembles them into a double.
 *
 * `val` must be an addressable double lvalue. Every macro argument is
 * parenthesised so that compound expressions (e.g. `c ? a : b`) are
 * address-taken as a whole instead of being split by operator precedence.
 */
#define __shfl_down_double(val , delta , width) __shfl_down_double_( *( (int2*)(&(val)) ) , (delta) , (width) )
/**
 * @param val    the double's bit pattern viewed as two 32-bit integers
 * @param delta  lane offset forwarded to __shfl_down
 * @param width  segment width forwarded to __shfl_down
 * @return       the double rebuilt from the two shuffled halves
 */
__device__ __forceinline__ double __shfl_down_double_(int2 & val , unsigned char delta , unsigned char width ){
    int2 out = val;  // work on a copy; the caller's storage is left untouched
    out.x = __shfl_down(out.x , delta , width);
    out.y = __shfl_down(out.y , delta , width);
    // Both halves were shuffled with identical delta/width, so they are read
    // back together as a single double.
    return ( *( (double*)(&out) ) );
}

/*
 * Fallback indexed shuffle for doubles, used when __NVCC__ is not defined.
 *
 * The macro reinterprets the caller's double lvalue as an int2 and hands it to
 * __shfl_double_, which shuffles the two 32-bit halves separately with the
 * mask-less __shfl intrinsic and reassembles them into a double.
 *
 * `val` must be an addressable double lvalue. Every macro argument is
 * parenthesised so that compound expressions (e.g. `c ? a : b`) are
 * address-taken as a whole instead of being split by operator precedence.
 */
#define __shfl_double(val , srcLane , width) __shfl_double_( *( (int2*)(&(val)) ) , (srcLane) , (width) )
/**
 * @param val      the double's bit pattern viewed as two 32-bit integers
 * @param srcLane  source lane index forwarded to __shfl
 * @param width    segment width forwarded to __shfl
 * @return         the double rebuilt from the two shuffled halves
 */
__device__ __forceinline__ double __shfl_double_(int2 & val , unsigned char srcLane , unsigned char width){
    int2 out = val;  // work on a copy; the caller's storage is left untouched
    out.x = __shfl(out.x , srcLane , width);
    out.y = __shfl(out.y , srcLane , width);
    // Both halves were fetched with identical srcLane/width, so they are read
    // back together as a single double.
    return ( *( (double*)(&out) ) );
}

/*
 * Fallback butterfly (XOR) shuffle for doubles, used when __NVCC__ is not
 * defined.
 *
 * The macro reinterprets the caller's double lvalue as an int2 and hands it to
 * __shfl_xor_double_, which shuffles the two 32-bit halves separately with the
 * mask-less __shfl_xor intrinsic and reassembles them into a double.
 *
 * `val` must be an addressable double lvalue. Every macro argument is
 * parenthesised so that compound expressions (e.g. `c ? a : b`) are
 * address-taken as a whole instead of being split by operator precedence.
 */
#define __shfl_xor_double(val , laneMask , width) __shfl_xor_double_( *( (int2*)(&(val)) ) , (laneMask) , (width) )
/**
 * @param val      the double's bit pattern viewed as two 32-bit integers
 * @param srcLane  value forwarded as the second argument of __shfl_xor (the
 *                 XOR lane mask, despite the parameter's name)
 * @param width    segment width forwarded to __shfl_xor
 * @return         the double rebuilt from the two shuffled halves
 */
__device__ __forceinline__ double __shfl_xor_double_(int2 & val , unsigned char srcLane , unsigned char width){
    int2 out = val;  // work on a copy; the caller's storage is left untouched
    out.x = __shfl_xor(out.x , srcLane , width);
    out.y = __shfl_xor(out.y , srcLane , width);
    // Both halves were exchanged with identical mask/width, so they are read
    // back together as a single double.
    return ( *( (double*)(&out) ) );
}

#endif


#ifdef __cplusplus
}
#endif
#endif