Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
82069311
Commit
82069311
authored
Jun 02, 2015
by
Peter Eastman
Browse files
Cleaned up CUDA FFT code
parent
d3e91b15
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
67 additions
and
89 deletions
+67
-89
platforms/cuda/include/CudaKernels.h
platforms/cuda/include/CudaKernels.h
+1
-1
platforms/cuda/src/CudaFFT3D.cpp
platforms/cuda/src/CudaFFT3D.cpp
+18
-39
platforms/cuda/src/CudaKernels.cpp
platforms/cuda/src/CudaKernels.cpp
+44
-34
platforms/cuda/src/kernels/fft.cu
platforms/cuda/src/kernels/fft.cu
+4
-15
No files found.
platforms/cuda/include/CudaKernels.h
View file @
82069311
...
@@ -665,7 +665,7 @@ private:
...
@@ -665,7 +665,7 @@ private:
std
::
vector
<
std
::
pair
<
int
,
int
>
>
exceptionAtoms
;
std
::
vector
<
std
::
pair
<
int
,
int
>
>
exceptionAtoms
;
double
ewaldSelfEnergy
,
dispersionCoefficient
,
alpha
;
double
ewaldSelfEnergy
,
dispersionCoefficient
,
alpha
;
int
interpolateForceThreads
;
int
interpolateForceThreads
;
bool
hasCoulomb
,
hasLJ
,
usePmeStream
;
bool
hasCoulomb
,
hasLJ
,
usePmeStream
,
useCudaFFT
;
static
const
int
PmeOrder
=
5
;
static
const
int
PmeOrder
=
5
;
};
};
...
...
platforms/cuda/src/CudaFFT3D.cpp
View file @
82069311
...
@@ -168,14 +168,12 @@ static int getSmallestRadix(int size) {
...
@@ -168,14 +168,12 @@ static int getSmallestRadix(int size) {
}
}
CUfunction
CudaFFT3D
::
createKernel
(
int
xsize
,
int
ysize
,
int
zsize
,
int
&
threads
,
int
axis
,
bool
forward
,
bool
inputIsReal
)
{
CUfunction
CudaFFT3D
::
createKernel
(
int
xsize
,
int
ysize
,
int
zsize
,
int
&
threads
,
int
axis
,
bool
forward
,
bool
inputIsReal
)
{
int
maxThreads
=
256
;
//std::min(256, (int) context.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
int
maxThreads
=
256
;
// while (maxThreads > 128 && maxThreads-64 >= zsize)
// while (maxThreads > 128 && maxThreads-64 >= zsize)
// maxThreads -= 64;
// maxThreads -= 64;
int
threadsPerBlock
=
zsize
/
getSmallestRadix
(
zsize
);
int
threadsPerBlock
=
zsize
/
getSmallestRadix
(
zsize
);
bool
isCPU
=
false
;
//context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU;
bool
loopRequired
=
(
threadsPerBlock
>
maxThreads
||
isCPU
);
stringstream
source
;
stringstream
source
;
int
blocksPerGroup
=
(
loopRequired
?
1
:
max
(
1
,
maxThreads
/
threadsPerBlock
)
)
;
int
blocksPerGroup
=
max
(
1
,
maxThreads
/
threadsPerBlock
);
int
stage
=
0
;
int
stage
=
0
;
int
L
=
zsize
;
int
L
=
zsize
;
int
m
=
1
;
int
m
=
1
;
...
@@ -201,19 +199,13 @@ CUfunction CudaFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads
...
@@ -201,19 +199,13 @@ CUfunction CudaFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads
source
<<
"{
\n
"
;
source
<<
"{
\n
"
;
L
=
L
/
radix
;
L
=
L
/
radix
;
source
<<
"// Pass "
<<
(
stage
+
1
)
<<
" (radix "
<<
radix
<<
")
\n
"
;
source
<<
"// Pass "
<<
(
stage
+
1
)
<<
" (radix "
<<
radix
<<
")
\n
"
;
if
(
loopRequired
)
{
if
(
L
*
m
<
threadsPerBlock
)
source
<<
"for (int i = threadIdx.x; i < "
<<
(
L
*
m
)
<<
"; i += blockDim.x) {
\n
"
;
source
<<
"if (threadIdx.x < "
<<
(
blocksPerGroup
*
L
*
m
)
<<
") {
\n
"
;
source
<<
"int base = i;
\n
"
;
else
}
source
<<
"{
\n
"
;
else
{
source
<<
"int block = threadIdx.x/"
<<
(
L
*
m
)
<<
";
\n
"
;
if
(
L
*
m
<
threadsPerBlock
)
source
<<
"int i = threadIdx.x-block*"
<<
(
L
*
m
)
<<
";
\n
"
;
source
<<
"if (threadIdx.x < "
<<
(
blocksPerGroup
*
L
*
m
)
<<
") {
\n
"
;
source
<<
"int base = i+block*"
<<
zsize
<<
";
\n
"
;
else
source
<<
"{
\n
"
;
source
<<
"int block = threadIdx.x/"
<<
(
L
*
m
)
<<
";
\n
"
;
source
<<
"int i = threadIdx.x-block*"
<<
(
L
*
m
)
<<
";
\n
"
;
source
<<
"int base = i+block*"
<<
zsize
<<
";
\n
"
;
}
source
<<
"int j = i/"
<<
m
<<
";
\n
"
;
source
<<
"int j = i/"
<<
m
<<
";
\n
"
;
if
(
radix
==
7
)
{
if
(
radix
==
7
)
{
source
<<
"real2 c0 = data"
<<
input
<<
"[base];
\n
"
;
source
<<
"real2 c0 = data"
<<
input
<<
"[base];
\n
"
;
...
@@ -328,27 +320,15 @@ CUfunction CudaFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads
...
@@ -328,27 +320,15 @@ CUfunction CudaFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads
bool
outputIsReal
=
(
inputIsReal
&&
axis
==
2
&&
!
forward
);
bool
outputIsReal
=
(
inputIsReal
&&
axis
==
2
&&
!
forward
);
bool
outputIsPacked
=
(
inputIsReal
&&
axis
==
2
&&
forward
);
bool
outputIsPacked
=
(
inputIsReal
&&
axis
==
2
&&
forward
);
string
outputSuffix
=
(
outputIsReal
?
".x"
:
""
);
string
outputSuffix
=
(
outputIsReal
?
".x"
:
""
);
if
(
loopRequired
||
true
)
{
if
(
outputIsPacked
)
if
(
outputIsPacked
)
source
<<
"if (index < XSIZE*YSIZE && x < XSIZE/2+1)
\n
"
;
source
<<
"if (index < XSIZE*YSIZE && x < XSIZE/2+1)
\n
"
;
else
else
source
<<
"if (index < XSIZE*YSIZE)
\n
"
;
source
<<
"if (index < XSIZE*YSIZE)
\n
"
;
source
<<
"for (int i = threadIdx.x-block*THREADS_PER_BLOCK; i < ZSIZE; i += THREADS_PER_BLOCK)
\n
"
;
source
<<
"for (int i = threadIdx.x-block*THREADS_PER_BLOCK; i < ZSIZE; i += THREADS_PER_BLOCK)
\n
"
;
if
(
outputIsPacked
)
if
(
outputIsPacked
)
source
<<
"out[y*(ZSIZE*(XSIZE/2+1))+i*(XSIZE/2+1)+x] = data"
<<
(
stage
%
2
)
<<
"[i+block*ZSIZE]"
<<
outputSuffix
<<
";
\n
"
;
source
<<
"out[y*(ZSIZE*(XSIZE/2+1))+i*(XSIZE/2+1)+x] = data"
<<
(
stage
%
2
)
<<
"[i+block*ZSIZE]"
<<
outputSuffix
<<
";
\n
"
;
else
else
source
<<
"out[y*(ZSIZE*XSIZE)+i*XSIZE+x] = data"
<<
(
stage
%
2
)
<<
"[i+block*ZSIZE]"
<<
outputSuffix
<<
";
\n
"
;
source
<<
"out[y*(ZSIZE*XSIZE)+i*XSIZE+x] = data"
<<
(
stage
%
2
)
<<
"[i+block*ZSIZE]"
<<
outputSuffix
<<
";
\n
"
;
}
else
{
if
(
outputIsPacked
)
{
source
<<
"if (index < XSIZE*YSIZE && x < XSIZE/2+1)
\n
"
;
source
<<
"out[y*(ZSIZE*(XSIZE/2+1))+(threadIdx.x%ZSIZE)*(XSIZE/2+1)+x] = data"
<<
(
stage
%
2
)
<<
"[threadIdx.x]"
<<
outputSuffix
<<
";
\n
"
;
}
else
{
source
<<
"if (index < XSIZE*YSIZE)
\n
"
;
source
<<
"out[y*(ZSIZE*XSIZE)+(threadIdx.x%ZSIZE)*XSIZE+x] = data"
<<
(
stage
%
2
)
<<
"[threadIdx.x]"
<<
outputSuffix
<<
";
\n
"
;
}
}
map
<
string
,
string
>
replacements
;
map
<
string
,
string
>
replacements
;
replacements
[
"XSIZE"
]
=
context
.
intToString
(
xsize
);
replacements
[
"XSIZE"
]
=
context
.
intToString
(
xsize
);
replacements
[
"YSIZE"
]
=
context
.
intToString
(
ysize
);
replacements
[
"YSIZE"
]
=
context
.
intToString
(
ysize
);
...
@@ -357,7 +337,6 @@ CUfunction CudaFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads
...
@@ -357,7 +337,6 @@ CUfunction CudaFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads
replacements
[
"THREADS_PER_BLOCK"
]
=
context
.
intToString
(
threadsPerBlock
);
replacements
[
"THREADS_PER_BLOCK"
]
=
context
.
intToString
(
threadsPerBlock
);
replacements
[
"M_PI"
]
=
context
.
doubleToString
(
M_PI
);
replacements
[
"M_PI"
]
=
context
.
doubleToString
(
M_PI
);
replacements
[
"COMPUTE_FFT"
]
=
source
.
str
();
replacements
[
"COMPUTE_FFT"
]
=
source
.
str
();
replacements
[
"LOOP_REQUIRED"
]
=
(
loopRequired
?
"1"
:
"0"
);
replacements
[
"SIGN"
]
=
(
forward
?
"1"
:
"-1"
);
replacements
[
"SIGN"
]
=
(
forward
?
"1"
:
"-1"
);
replacements
[
"INPUT_TYPE"
]
=
(
inputIsReal
&&
axis
==
0
&&
forward
?
"real"
:
"real2"
);
replacements
[
"INPUT_TYPE"
]
=
(
inputIsReal
&&
axis
==
0
&&
forward
?
"real"
:
"real2"
);
replacements
[
"OUTPUT_TYPE"
]
=
(
outputIsReal
?
"real"
:
"real2"
);
replacements
[
"OUTPUT_TYPE"
]
=
(
outputIsReal
?
"real"
:
"real2"
);
...
@@ -366,6 +345,6 @@ CUfunction CudaFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads
...
@@ -366,6 +345,6 @@ CUfunction CudaFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads
replacements
[
"OUTPUT_IS_PACKED"
]
=
(
outputIsPacked
?
"1"
:
"0"
);
replacements
[
"OUTPUT_IS_PACKED"
]
=
(
outputIsPacked
?
"1"
:
"0"
);
CUmodule
module
=
context
.
createModule
(
CudaKernelSources
::
vectorOps
+
context
.
replaceStrings
(
CudaKernelSources
::
fft
,
replacements
));
CUmodule
module
=
context
.
createModule
(
CudaKernelSources
::
vectorOps
+
context
.
replaceStrings
(
CudaKernelSources
::
fft
,
replacements
));
CUfunction
kernel
=
context
.
getKernel
(
module
,
"execFFT"
);
CUfunction
kernel
=
context
.
getKernel
(
module
,
"execFFT"
);
threads
=
(
isCPU
?
1
:
blocksPerGroup
*
threadsPerBlock
)
;
threads
=
blocksPerGroup
*
threadsPerBlock
;
return
kernel
;
return
kernel
;
}
}
platforms/cuda/src/CudaKernels.cpp
View file @
82069311
...
@@ -1502,8 +1502,10 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
...
@@ -1502,8 +1502,10 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
if (pmeio != NULL)
if (pmeio != NULL)
delete pmeio;
delete pmeio;
if (hasInitializedFFT) {
if (hasInitializedFFT) {
cufftDestroy(fftForward);
if (useCudaFFT) {
cufftDestroy(fftBackward);
cufftDestroy(fftForward);
cufftDestroy(fftBackward);
}
if (usePmeStream) {
if (usePmeStream) {
cuStreamDestroy(pmeStream);
cuStreamDestroy(pmeStream);
cuEventDestroy(pmeSyncEvent);
cuEventDestroy(pmeSyncEvent);
...
@@ -1694,39 +1696,40 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
...
@@ -1694,39 +1696,40 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
// Create required data structures.
// Create required data structures.
int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
directPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*gridSizeZ, cu.getComputeCapability() >= 2.0 ? 2*elementSize : 2*sizeof(long long), "originalPmeGrid");
directPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*gridSizeZ, cu.getComputeCapability() >= 2.0 ? elementSize : sizeof(long long), "originalPmeGrid");
reciprocalPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "reciprocalPmeGrid");
reciprocalPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*(gridSizeZ/2+1), 2*elementSize, "reciprocalPmeGrid");
cu.addAutoclearBuffer(*directPmeGrid);
cu.addAutoclearBuffer(*directPmeGrid);
pmeBsplineModuliX = new CudaArray(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
pmeBsplineModuliX = new CudaArray(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
pmeBsplineModuliY = new CudaArray(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
pmeBsplineModuliY = new CudaArray(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
pmeBsplineModuliZ = new CudaArray(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
pmeBsplineModuliZ = new CudaArray(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
pmeAtomGridIndex = CudaArray::create<int2>(cu, numParticles, "pmeAtomGridIndex");
pmeAtomGridIndex = CudaArray::create<int2>(cu, numParticles, "pmeAtomGridIndex");
sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms());
sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms());
fft = new CudaFFT3D(cu, gridSizeX, gridSizeY, gridSizeZ, true);
useCudaFFT = false; // We might switch back in the future, once Nvidia has all their bugs worked out
if (useCudaFFT) {
cufftResult result = cufftPlan3d(&fftForward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_D2Z : CUFFT_R2C);
cufftResult result = cufftPlan3d(&fftForward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_D2Z : CUFFT_R2C);
if (result != CUFFT_SUCCESS)
if (result != CUFFT_SUCCESS)
throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
result = cufftPlan3d(&fftBackward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_Z2D : CUFFT_C2R);
result = cufftPlan3d(&fftBackward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_Z2D : CUFFT_C2R);
if (result != CUFFT_SUCCESS)
if (result != CUFFT_SUCCESS)
throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
cufftSetCompatibilityMode(fftForward, CUFFT_COMPATIBILITY_NATIVE);
cufftSetCompatibilityMode(fftForward, CUFFT_COMPATIBILITY_NATIVE);
cufftSetCompatibilityMode(fftBackward, CUFFT_COMPATIBILITY_NATIVE);
cufftSetCompatibilityMode(fftBackward, CUFFT_COMPATIBILITY_NATIVE);
}
else
fft = new CudaFFT3D(cu, gridSizeX, gridSizeY, gridSizeZ, true);
// Prepare for doing PME on its own stream.
// Prepare for doing PME on its own stream.
int cufftVersion
;
char deviceName[100]
;
cu
fftGetVersion(&cufftVersion
);
cu
DeviceGetName(deviceName, 100, cu.getDevice()
);
usePmeStream =
true;//(cu.getComputeCapability() < 5.0 && numParticles < 130000 && cufftVersion >= 6000 && cufftVersion != 7000); // Workarounds for various CUDA bugs
usePmeStream =
(string(deviceName) != "GeForce GTX 980"); // Using a separate stream is slower on GTX 980
if (usePmeStream) {
if (usePmeStream) {
cuStreamCreate(&pmeStream, CU_STREAM_NON_BLOCKING);
cuStreamCreate(&pmeStream, CU_STREAM_NON_BLOCKING);
cufftSetStream(fftForward, pmeStream);
if (useCudaFFT) {
cufftSetStream(fftBackward, pmeStream);
cufftSetStream(fftForward, pmeStream);
cufftSetStream(fftBackward, pmeStream);
}
CHECK_RESULT(cuEventCreate(&pmeSyncEvent, CU_EVENT_DISABLE_TIMING), "Error creating event for NonbondedForce");
CHECK_RESULT(cuEventCreate(&pmeSyncEvent, CU_EVENT_DISABLE_TIMING), "Error creating event for NonbondedForce");
int recipForceGroup = force.getReciprocalSpaceForceGroup();
int recipForceGroup = force.getReciprocalSpaceForceGroup();
if (recipForceGroup < 0)
if (recipForceGroup < 0)
...
@@ -1896,11 +1899,15 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
...
@@ -1896,11 +1899,15 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, directPmeGrid->getSize());
cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, directPmeGrid->getSize());
}
}
// if (cu.getUseDoublePrecision())
if (useCudaFFT) {
// cufftExecD2Z(fftForward, (double*) directPmeGrid->getDevicePointer(), (double2*) reciprocalPmeGrid->getDevicePointer());
if (cu.getUseDoublePrecision())
// else
cufftExecD2Z(fftForward, (double*) directPmeGrid->getDevicePointer(), (double2*) reciprocalPmeGrid->getDevicePointer());
// cufftExecR2C(fftForward, (float*) directPmeGrid->getDevicePointer(), (float2*) reciprocalPmeGrid->getDevicePointer());
else
fft->execFFT(*directPmeGrid, *reciprocalPmeGrid, true);
cufftExecR2C(fftForward, (float*) directPmeGrid->getDevicePointer(), (float2*) reciprocalPmeGrid->getDevicePointer());
}
else {
fft->execFFT(*directPmeGrid, *reciprocalPmeGrid, true);
}
if (includeEnergy) {
if (includeEnergy) {
void* computeEnergyArgs[] = {&reciprocalPmeGrid->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
void* computeEnergyArgs[] = {&reciprocalPmeGrid->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
...
@@ -1914,12 +1921,15 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
...
@@ -1914,12 +1921,15 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeConvolutionKernel, convolutionArgs, cu.getNumAtoms());
cu.executeKernel(pmeConvolutionKernel, convolutionArgs, cu.getNumAtoms());
// if (cu.getUseDoublePrecision())
if (useCudaFFT) {
// cufftExecZ2D(fftBackward, (double2*) reciprocalPmeGrid->getDevicePointer(), (double*) directPmeGrid->getDevicePointer());
if (cu.getUseDoublePrecision())
// else
cufftExecZ2D(fftBackward, (double2*) reciprocalPmeGrid->getDevicePointer(), (double*) directPmeGrid->getDevicePointer());
// cufftExecC2R(fftBackward, (float2*) reciprocalPmeGrid->getDevicePointer(), (float*) directPmeGrid->getDevicePointer());
else
fft->execFFT(*reciprocalPmeGrid, *directPmeGrid, false);
cufftExecC2R(fftBackward, (float2*) reciprocalPmeGrid->getDevicePointer(), (float*) directPmeGrid->getDevicePointer());
}
else {
fft->execFFT(*reciprocalPmeGrid, *directPmeGrid, false);
}
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &directPmeGrid->getDevicePointer(),
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &directPmeGrid->getDevicePointer(),
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex->getDevicePointer()};
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex->getDevicePointer()};
...
...
platforms/cuda/src/kernels/fft.cu
View file @
82069311
...
@@ -35,26 +35,15 @@ extern "C" __global__ void execFFT(const INPUT_TYPE* __restrict__ in, OUTPUT_TYP
...
@@ -35,26 +35,15 @@ extern "C" __global__ void execFFT(const INPUT_TYPE* __restrict__ in, OUTPUT_TYP
#if OUTPUT_IS_PACKED
#if OUTPUT_IS_PACKED
if
(
x
<
XSIZE
/
2
+
1
)
{
if
(
x
<
XSIZE
/
2
+
1
)
{
#endif
#endif
//#if LOOP_REQUIRED
if
(
index
<
XSIZE
*
YSIZE
)
if
(
index
<
XSIZE
*
YSIZE
)
for
(
int
i
=
threadIdx
.
x
-
block
*
THREADS_PER_BLOCK
;
i
<
ZSIZE
;
i
+=
THREADS_PER_BLOCK
)
for
(
int
i
=
threadIdx
.
x
-
block
*
THREADS_PER_BLOCK
;
i
<
ZSIZE
;
i
+=
THREADS_PER_BLOCK
)
#if INPUT_IS_REAL
#if INPUT_IS_REAL
data0
[
i
+
block
*
ZSIZE
]
=
make_real2
(
in
[
x
*
(
YSIZE
*
ZSIZE
)
+
y
*
ZSIZE
+
i
],
0
);
data0
[
i
+
block
*
ZSIZE
]
=
make_real2
(
in
[
x
*
(
YSIZE
*
ZSIZE
)
+
y
*
ZSIZE
+
i
],
0
);
#elif INPUT_IS_PACKED
#elif INPUT_IS_PACKED
data0
[
i
+
block
*
ZSIZE
]
=
loadComplexValue
(
in
,
x
,
y
,
i
);
data0
[
i
+
block
*
ZSIZE
]
=
loadComplexValue
(
in
,
x
,
y
,
i
);
#else
#else
data0
[
i
+
block
*
ZSIZE
]
=
in
[
x
*
(
YSIZE
*
ZSIZE
)
+
y
*
ZSIZE
+
i
];
data0
[
i
+
block
*
ZSIZE
]
=
in
[
x
*
(
YSIZE
*
ZSIZE
)
+
y
*
ZSIZE
+
i
];
#endif
#endif
//#else
// if (index < XSIZE*YSIZE && (threadIdx.x%BLOCK_SIZE) < ZSIZE)
// #if INPUT_IS_REAL
// data0[threadIdx.x] = make_real2(in[x*(YSIZE*ZSIZE)+y*ZSIZE+threadIdx.x%BLOCK_SIZE], 0);
// #elif INPUT_IS_PACKED
// data0[threadIdx.x] = loadComplexValue(in, x, y, threadIdx.x%BLOCK_SIZE);
// #else
// data0[threadIdx.x] = in[x*(YSIZE*ZSIZE)+y*ZSIZE+threadIdx.x%BLOCK_SIZE];
// #endif
//#endif
#if OUTPUT_IS_PACKED
#if OUTPUT_IS_PACKED
}
}
#endif
#endif
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment