Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
97ebc356
Commit
97ebc356
authored
Jun 03, 2015
by
peastman
Browse files
Merge pull request #944 from peastman/fft
Ported OpenCL FFT to CUDA
parents
3e5ca417
ad0e95ab
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
869 additions
and
56 deletions
+869
-56
platforms/cuda/include/CudaFFT3D.h
platforms/cuda/include/CudaFFT3D.h
+101
-0
platforms/cuda/include/CudaKernels.h
platforms/cuda/include/CudaKernels.h
+4
-2
platforms/cuda/src/CudaFFT3D.cpp
platforms/cuda/src/CudaFFT3D.cpp
+350
-0
platforms/cuda/src/CudaKernels.cpp
platforms/cuda/src/CudaKernels.cpp
+49
-54
platforms/cuda/src/kernels/fft.cu
platforms/cuda/src/kernels/fft.cu
+53
-0
platforms/cuda/src/kernels/fftR2C.cu
platforms/cuda/src/kernels/fftR2C.cu
+172
-0
platforms/cuda/tests/TestCudaFFT3D.cpp
platforms/cuda/tests/TestCudaFFT3D.cpp
+140
-0
No files found.
platforms/cuda/include/CudaFFT3D.h
0 → 100644
View file @
97ebc356
#ifndef __OPENMM_CUDAFFT3D_H__
#define __OPENMM_CUDAFFT3D_H__
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2015 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaArray.h"
namespace
OpenMM
{
/**
* This class performs three dimensional Fast Fourier Transforms. It is based on the
* mixed radix algorithm described in
* <p>
* Takahashi, D. and Kanada, Y., "High-Performance Radix-2, 3 and 5 Parallel 1-D Complex
* FFT Algorithms for Distributed-Memory Parallel Computers." Journal of Supercomputing,
* 15, 207–228 (2000).
* <p>
* This class places certain restrictions on the allowed dimensions of the grid. First,
* the size of each dimension may have no prime factors other than 2, 3, 5, and 7. You
* can call findLegalDimension() to determine the smallest size that satisfies this
* requirement and is greater than or equal to a specified minimum size. Second, the size
* of each dimension must be small enough to compute each 1D transform entirely in local
* memory with one work unit per data point. This will vary between platforms, but is
* typically at least 512.
* <p>
* Note that this class performs an unnormalized transform. That means that if you perform
* a forward transform followed immediately by an inverse transform, the effect is to
* multiply every value of the original data set by the total number of data points.
*/
class
OPENMM_EXPORT_CUDA
CudaFFT3D
{
public:
/**
* Create an CudaFFT3D object for performing transforms of a particular size.
*
* @param context the context in which to perform calculations
* @param xsize the first dimension of the data sets on which FFTs will be performed
* @param ysize the second dimension of the data sets on which FFTs will be performed
* @param zsize the third dimension of the data sets on which FFTs will be performed
* @param realToComplex if true, a real-to-complex transform will be done. Otherwise, it is complex-to-complex.
*/
CudaFFT3D
(
CudaContext
&
context
,
int
xsize
,
int
ysize
,
int
zsize
,
bool
realToComplex
=
false
);
/**
* Perform a Fourier transform. The transform cannot be done in-place: the input and output
* arrays must be different. Also, the input array is used as workspace, so its contents
* are destroyed. This also means that both arrays must be large enough to hold complex values,
* even when performing a real-to-complex transform.
* <p>
* When performing a real-to-complex transform, the output data is of size xsize*ysize*(zsize/2+1)
* and contains only the non-redundant elements.
*
* @param in the data to transform, ordered such that in[x*ysize*zsize + y*zsize + z] contains element (x, y, z)
* @param out on exit, this contains the transformed data
* @param forward true to perform a forward transform, false to perform an inverse transform
*/
void
execFFT
(
CudaArray
&
in
,
CudaArray
&
out
,
bool
forward
=
true
);
/**
* Get the smallest legal size for a dimension of the grid (that is, a size with no prime
* factors other than 2, 3, 5, and 7).
*
* @param minimum the minimum size the return value must be greater than or equal to
*/
static
int
findLegalDimension
(
int
minimum
);
private:
CUfunction
createKernel
(
int
xsize
,
int
ysize
,
int
zsize
,
int
&
threads
,
int
axis
,
bool
forward
,
bool
inputIsReal
);
int
xsize
,
ysize
,
zsize
;
int
xthreads
,
ythreads
,
zthreads
;
bool
packRealAsComplex
;
CudaContext
&
context
;
CUfunction
xkernel
,
ykernel
,
zkernel
;
CUfunction
invxkernel
,
invykernel
,
invzkernel
;
CUfunction
packForwardKernel
,
unpackForwardKernel
,
packBackwardKernel
,
unpackBackwardKernel
;
};
}
// namespace OpenMM
#endif // __OPENMM_CUDAFFT3D_H__
platforms/cuda/include/CudaKernels.h
View file @
97ebc356
...
@@ -30,6 +30,7 @@
...
@@ -30,6 +30,7 @@
#include "CudaPlatform.h"
#include "CudaPlatform.h"
#include "CudaArray.h"
#include "CudaArray.h"
#include "CudaContext.h"
#include "CudaContext.h"
#include "CudaFFT3D.h"
#include "CudaParameterSet.h"
#include "CudaParameterSet.h"
#include "CudaSort.h"
#include "CudaSort.h"
#include "openmm/kernels.h"
#include "openmm/kernels.h"
...
@@ -588,7 +589,7 @@ class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
...
@@ -588,7 +589,7 @@ class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
public:
public:
CudaCalcNonbondedForceKernel
(
std
::
string
name
,
const
Platform
&
platform
,
CudaContext
&
cu
,
const
System
&
system
)
:
CalcNonbondedForceKernel
(
name
,
platform
),
CudaCalcNonbondedForceKernel
(
std
::
string
name
,
const
Platform
&
platform
,
CudaContext
&
cu
,
const
System
&
system
)
:
CalcNonbondedForceKernel
(
name
,
platform
),
cu
(
cu
),
hasInitializedFFT
(
false
),
sigmaEpsilon
(
NULL
),
exceptionParams
(
NULL
),
cosSinSums
(
NULL
),
directPmeGrid
(
NULL
),
reciprocalPmeGrid
(
NULL
),
cu
(
cu
),
hasInitializedFFT
(
false
),
sigmaEpsilon
(
NULL
),
exceptionParams
(
NULL
),
cosSinSums
(
NULL
),
directPmeGrid
(
NULL
),
reciprocalPmeGrid
(
NULL
),
pmeBsplineModuliX
(
NULL
),
pmeBsplineModuliY
(
NULL
),
pmeBsplineModuliZ
(
NULL
),
pmeAtomRange
(
NULL
),
pmeAtomGridIndex
(
NULL
),
sort
(
NULL
),
pmeio
(
NULL
)
{
pmeBsplineModuliX
(
NULL
),
pmeBsplineModuliY
(
NULL
),
pmeBsplineModuliZ
(
NULL
),
pmeAtomRange
(
NULL
),
pmeAtomGridIndex
(
NULL
),
sort
(
NULL
),
fft
(
NULL
),
pmeio
(
NULL
)
{
}
}
~
CudaCalcNonbondedForceKernel
();
~
CudaCalcNonbondedForceKernel
();
/**
/**
...
@@ -649,6 +650,7 @@ private:
...
@@ -649,6 +650,7 @@ private:
PmeIO
*
pmeio
;
PmeIO
*
pmeio
;
CUstream
pmeStream
;
CUstream
pmeStream
;
CUevent
pmeSyncEvent
;
CUevent
pmeSyncEvent
;
CudaFFT3D
*
fft
;
cufftHandle
fftForward
;
cufftHandle
fftForward
;
cufftHandle
fftBackward
;
cufftHandle
fftBackward
;
CUfunction
ewaldSumsKernel
;
CUfunction
ewaldSumsKernel
;
...
@@ -663,7 +665,7 @@ private:
...
@@ -663,7 +665,7 @@ private:
std
::
vector
<
std
::
pair
<
int
,
int
>
>
exceptionAtoms
;
std
::
vector
<
std
::
pair
<
int
,
int
>
>
exceptionAtoms
;
double
ewaldSelfEnergy
,
dispersionCoefficient
,
alpha
;
double
ewaldSelfEnergy
,
dispersionCoefficient
,
alpha
;
int
interpolateForceThreads
;
int
interpolateForceThreads
;
bool
hasCoulomb
,
hasLJ
,
usePmeStream
;
bool
hasCoulomb
,
hasLJ
,
usePmeStream
,
useCudaFFT
;
static
const
int
PmeOrder
=
5
;
static
const
int
PmeOrder
=
5
;
};
};
...
...
platforms/cuda/src/CudaFFT3D.cpp
0 → 100644
View file @
97ebc356
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2015 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaFFT3D.h"
#include "CudaContext.h"
#include "CudaKernelSources.h"
#include "SimTKOpenMMRealType.h"
#include <map>
#include <sstream>
#include <string>
using
namespace
OpenMM
;
using
namespace
std
;
CudaFFT3D
::
CudaFFT3D
(
CudaContext
&
context
,
int
xsize
,
int
ysize
,
int
zsize
,
bool
realToComplex
)
:
context
(
context
),
xsize
(
xsize
),
ysize
(
ysize
),
zsize
(
zsize
)
{
packRealAsComplex
=
false
;
int
packedXSize
=
xsize
;
int
packedYSize
=
ysize
;
int
packedZSize
=
zsize
;
if
(
realToComplex
)
{
// If any axis size is even, we can pack the real values into a complex grid that is only half as large.
// Look for an appropriate axis.
packRealAsComplex
=
true
;
int
packedAxis
,
bufferSize
;
if
(
xsize
%
2
==
0
)
{
packedAxis
=
0
;
packedXSize
/=
2
;
bufferSize
=
packedXSize
;
}
else
if
(
ysize
%
2
==
0
)
{
packedAxis
=
1
;
packedYSize
/=
2
;
bufferSize
=
packedYSize
;
}
else
if
(
zsize
%
2
==
0
)
{
packedAxis
=
2
;
packedZSize
/=
2
;
bufferSize
=
packedZSize
;
}
else
packRealAsComplex
=
false
;
if
(
packRealAsComplex
)
{
// Build the kernels for packing and unpacking the data.
map
<
string
,
string
>
defines
;
defines
[
"XSIZE"
]
=
context
.
intToString
(
xsize
);
defines
[
"YSIZE"
]
=
context
.
intToString
(
ysize
);
defines
[
"ZSIZE"
]
=
context
.
intToString
(
zsize
);
defines
[
"PACKED_AXIS"
]
=
context
.
intToString
(
packedAxis
);
defines
[
"PACKED_XSIZE"
]
=
context
.
intToString
(
packedXSize
);
defines
[
"PACKED_YSIZE"
]
=
context
.
intToString
(
packedYSize
);
defines
[
"PACKED_ZSIZE"
]
=
context
.
intToString
(
packedZSize
);
CUmodule
module
=
context
.
createModule
(
CudaKernelSources
::
vectorOps
+
CudaKernelSources
::
fftR2C
,
defines
);
packForwardKernel
=
context
.
getKernel
(
module
,
"packForwardData"
);
unpackForwardKernel
=
context
.
getKernel
(
module
,
"unpackForwardData"
);
packBackwardKernel
=
context
.
getKernel
(
module
,
"packBackwardData"
);
unpackBackwardKernel
=
context
.
getKernel
(
module
,
"unpackBackwardData"
);
}
}
bool
inputIsReal
=
(
realToComplex
&&
!
packRealAsComplex
);
zkernel
=
createKernel
(
packedXSize
,
packedYSize
,
packedZSize
,
zthreads
,
0
,
true
,
inputIsReal
);
xkernel
=
createKernel
(
packedYSize
,
packedZSize
,
packedXSize
,
xthreads
,
1
,
true
,
inputIsReal
);
ykernel
=
createKernel
(
packedZSize
,
packedXSize
,
packedYSize
,
ythreads
,
2
,
true
,
inputIsReal
);
invzkernel
=
createKernel
(
packedXSize
,
packedYSize
,
packedZSize
,
zthreads
,
0
,
false
,
inputIsReal
);
invxkernel
=
createKernel
(
packedYSize
,
packedZSize
,
packedXSize
,
xthreads
,
1
,
false
,
inputIsReal
);
invykernel
=
createKernel
(
packedZSize
,
packedXSize
,
packedYSize
,
ythreads
,
2
,
false
,
inputIsReal
);
}
void
CudaFFT3D
::
execFFT
(
CudaArray
&
in
,
CudaArray
&
out
,
bool
forward
)
{
CUfunction
kernel1
=
(
forward
?
zkernel
:
invzkernel
);
CUfunction
kernel2
=
(
forward
?
xkernel
:
invxkernel
);
CUfunction
kernel3
=
(
forward
?
ykernel
:
invykernel
);
void
*
args1
[]
=
{
&
in
.
getDevicePointer
(),
&
out
.
getDevicePointer
()};
void
*
args2
[]
=
{
&
out
.
getDevicePointer
(),
&
in
.
getDevicePointer
()};
if
(
packRealAsComplex
)
{
CUfunction
packKernel
=
(
forward
?
packForwardKernel
:
packBackwardKernel
);
CUfunction
unpackKernel
=
(
forward
?
unpackForwardKernel
:
unpackBackwardKernel
);
int
gridSize
=
xsize
*
ysize
*
zsize
/
2
;
// Pack the data into a half sized grid.
context
.
executeKernel
(
packKernel
,
args1
,
gridSize
,
128
);
// Perform the FFT.
context
.
executeKernel
(
kernel1
,
args2
,
gridSize
,
zthreads
);
context
.
executeKernel
(
kernel2
,
args1
,
gridSize
,
xthreads
);
context
.
executeKernel
(
kernel3
,
args2
,
gridSize
,
ythreads
);
// Unpack the data.
context
.
executeKernel
(
unpackKernel
,
args1
,
gridSize
,
128
);
}
else
{
context
.
executeKernel
(
kernel1
,
args1
,
xsize
*
ysize
*
zsize
,
zthreads
);
context
.
executeKernel
(
kernel2
,
args2
,
xsize
*
ysize
*
zsize
,
xthreads
);
context
.
executeKernel
(
kernel3
,
args1
,
xsize
*
ysize
*
zsize
,
ythreads
);
}
}
int
CudaFFT3D
::
findLegalDimension
(
int
minimum
)
{
if
(
minimum
<
1
)
return
1
;
while
(
true
)
{
// Attempt to factor the current value.
int
unfactored
=
minimum
;
for
(
int
factor
=
2
;
factor
<
8
;
factor
++
)
{
while
(
unfactored
>
1
&&
unfactored
%
factor
==
0
)
unfactored
/=
factor
;
}
if
(
unfactored
==
1
)
return
minimum
;
minimum
++
;
}
}
static
int
getSmallestRadix
(
int
size
)
{
int
minRadix
=
1
;
int
unfactored
=
size
;
while
(
unfactored
%
7
==
0
)
{
minRadix
=
7
;
unfactored
/=
7
;
}
while
(
unfactored
%
5
==
0
)
{
minRadix
=
5
;
unfactored
/=
5
;
}
while
(
unfactored
%
4
==
0
)
{
minRadix
=
4
;
unfactored
/=
4
;
}
while
(
unfactored
%
3
==
0
)
{
minRadix
=
3
;
unfactored
/=
3
;
}
while
(
unfactored
%
2
==
0
)
{
minRadix
=
2
;
unfactored
/=
2
;
}
return
minRadix
;
}
CUfunction
CudaFFT3D
::
createKernel
(
int
xsize
,
int
ysize
,
int
zsize
,
int
&
threads
,
int
axis
,
bool
forward
,
bool
inputIsReal
)
{
int
maxThreads
=
256
;
// while (maxThreads > 128 && maxThreads-64 >= zsize)
// maxThreads -= 64;
int
threadsPerBlock
=
zsize
/
getSmallestRadix
(
zsize
);
stringstream
source
;
int
blocksPerGroup
=
max
(
1
,
maxThreads
/
threadsPerBlock
);
int
stage
=
0
;
int
L
=
zsize
;
int
m
=
1
;
// Factor zsize, generating an appropriate block of code for each factor.
while
(
L
>
1
)
{
int
input
=
stage
%
2
;
int
output
=
1
-
input
;
int
radix
;
if
(
L
%
7
==
0
)
radix
=
7
;
else
if
(
L
%
5
==
0
)
radix
=
5
;
else
if
(
L
%
4
==
0
)
radix
=
4
;
else
if
(
L
%
3
==
0
)
radix
=
3
;
else
if
(
L
%
2
==
0
)
radix
=
2
;
else
throw
OpenMMException
(
"Illegal size for FFT: "
+
context
.
intToString
(
zsize
));
source
<<
"{
\n
"
;
L
=
L
/
radix
;
source
<<
"// Pass "
<<
(
stage
+
1
)
<<
" (radix "
<<
radix
<<
")
\n
"
;
if
(
L
*
m
<
threadsPerBlock
)
source
<<
"if (threadIdx.x < "
<<
(
blocksPerGroup
*
L
*
m
)
<<
") {
\n
"
;
else
source
<<
"{
\n
"
;
source
<<
"int block = threadIdx.x/"
<<
(
L
*
m
)
<<
";
\n
"
;
source
<<
"int i = threadIdx.x-block*"
<<
(
L
*
m
)
<<
";
\n
"
;
source
<<
"int base = i+block*"
<<
zsize
<<
";
\n
"
;
source
<<
"int j = i/"
<<
m
<<
";
\n
"
;
if
(
radix
==
7
)
{
source
<<
"real2 c0 = data"
<<
input
<<
"[base];
\n
"
;
source
<<
"real2 c1 = data"
<<
input
<<
"[base+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 c2 = data"
<<
input
<<
"[base+"
<<
(
2
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 c3 = data"
<<
input
<<
"[base+"
<<
(
3
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 c4 = data"
<<
input
<<
"[base+"
<<
(
4
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 c5 = data"
<<
input
<<
"[base+"
<<
(
5
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 c6 = data"
<<
input
<<
"[base+"
<<
(
6
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 d0 = c1+c6;
\n
"
;
source
<<
"real2 d1 = c1-c6;
\n
"
;
source
<<
"real2 d2 = c2+c5;
\n
"
;
source
<<
"real2 d3 = c2-c5;
\n
"
;
source
<<
"real2 d4 = c4+c3;
\n
"
;
source
<<
"real2 d5 = c4-c3;
\n
"
;
source
<<
"real2 d6 = d2+d0;
\n
"
;
source
<<
"real2 d7 = d5+d3;
\n
"
;
source
<<
"real2 b0 = c0+d6+d4;
\n
"
;
source
<<
"real2 b1 = "
<<
context
.
doubleToString
((
cos
(
2
*
M_PI
/
7
)
+
cos
(
4
*
M_PI
/
7
)
+
cos
(
6
*
M_PI
/
7
))
/
3
-
1
)
<<
"*(d6+d4);
\n
"
;
source
<<
"real2 b2 = "
<<
context
.
doubleToString
((
2
*
cos
(
2
*
M_PI
/
7
)
-
cos
(
4
*
M_PI
/
7
)
-
cos
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d0-d4);
\n
"
;
source
<<
"real2 b3 = "
<<
context
.
doubleToString
((
cos
(
2
*
M_PI
/
7
)
-
2
*
cos
(
4
*
M_PI
/
7
)
+
cos
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d4-d2);
\n
"
;
source
<<
"real2 b4 = "
<<
context
.
doubleToString
((
cos
(
2
*
M_PI
/
7
)
+
cos
(
4
*
M_PI
/
7
)
-
2
*
cos
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d2-d0);
\n
"
;
source
<<
"real2 b5 = -(SIGN)*"
<<
context
.
doubleToString
((
sin
(
2
*
M_PI
/
7
)
+
sin
(
4
*
M_PI
/
7
)
-
sin
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d7+d1);
\n
"
;
source
<<
"real2 b6 = -(SIGN)*"
<<
context
.
doubleToString
((
2
*
sin
(
2
*
M_PI
/
7
)
-
sin
(
4
*
M_PI
/
7
)
+
sin
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d1-d5);
\n
"
;
source
<<
"real2 b7 = -(SIGN)*"
<<
context
.
doubleToString
((
sin
(
2
*
M_PI
/
7
)
-
2
*
sin
(
4
*
M_PI
/
7
)
-
sin
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d5-d3);
\n
"
;
source
<<
"real2 b8 = -(SIGN)*"
<<
context
.
doubleToString
((
sin
(
2
*
M_PI
/
7
)
+
sin
(
4
*
M_PI
/
7
)
+
2
*
sin
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d3-d1);
\n
"
;
source
<<
"real2 t0 = b0+b1;
\n
"
;
source
<<
"real2 t1 = b2+b3;
\n
"
;
source
<<
"real2 t2 = b4-b3;
\n
"
;
source
<<
"real2 t3 = -b2-b4;
\n
"
;
source
<<
"real2 t4 = b6+b7;
\n
"
;
source
<<
"real2 t5 = b8-b7;
\n
"
;
source
<<
"real2 t6 = -b8-b6;
\n
"
;
source
<<
"real2 t7 = t0+t1;
\n
"
;
source
<<
"real2 t8 = t0+t2;
\n
"
;
source
<<
"real2 t9 = t0+t3;
\n
"
;
source
<<
"real2 t10 = make_real2(t4.y+b5.y, -(t4.x+b5.x));
\n
"
;
source
<<
"real2 t11 = make_real2(t5.y+b5.y, -(t5.x+b5.x));
\n
"
;
source
<<
"real2 t12 = make_real2(t6.y+b5.y, -(t6.x+b5.x));
\n
"
;
source
<<
"data"
<<
output
<<
"[base+6*j*"
<<
m
<<
"] = b0;
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(6*j+1)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
zsize
<<
"/"
<<
(
7
*
L
)
<<
"], t7-t10);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(6*j+2)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
2
*
zsize
)
<<
"/"
<<
(
7
*
L
)
<<
"], t9-t12);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(6*j+3)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
3
*
zsize
)
<<
"/"
<<
(
7
*
L
)
<<
"], t8+t11);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(6*j+4)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
4
*
zsize
)
<<
"/"
<<
(
7
*
L
)
<<
"], t8-t11);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(6*j+5)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
5
*
zsize
)
<<
"/"
<<
(
7
*
L
)
<<
"], t9+t12);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(6*j+6)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
6
*
zsize
)
<<
"/"
<<
(
7
*
L
)
<<
"], t7+t10);
\n
"
;
}
else
if
(
radix
==
5
)
{
source
<<
"real2 c0 = data"
<<
input
<<
"[base];
\n
"
;
source
<<
"real2 c1 = data"
<<
input
<<
"[base+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 c2 = data"
<<
input
<<
"[base+"
<<
(
2
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 c3 = data"
<<
input
<<
"[base+"
<<
(
3
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 c4 = data"
<<
input
<<
"[base+"
<<
(
4
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 d0 = c1+c4;
\n
"
;
source
<<
"real2 d1 = c2+c3;
\n
"
;
source
<<
"real2 d2 = "
<<
context
.
doubleToString
(
sin
(
0.4
*
M_PI
))
<<
"*(c1-c4);
\n
"
;
source
<<
"real2 d3 = "
<<
context
.
doubleToString
(
sin
(
0.4
*
M_PI
))
<<
"*(c2-c3);
\n
"
;
source
<<
"real2 d4 = d0+d1;
\n
"
;
source
<<
"real2 d5 = "
<<
context
.
doubleToString
(
0.25
*
sqrt
(
5.0
))
<<
"*(d0-d1);
\n
"
;
source
<<
"real2 d6 = c0-0.25f*d4;
\n
"
;
source
<<
"real2 d7 = d6+d5;
\n
"
;
source
<<
"real2 d8 = d6-d5;
\n
"
;
string
coeff
=
context
.
doubleToString
(
sin
(
0.2
*
M_PI
)
/
sin
(
0.4
*
M_PI
));
source
<<
"real2 d9 = (SIGN)*make_real2(d2.y+"
<<
coeff
<<
"*d3.y, -d2.x-"
<<
coeff
<<
"*d3.x);
\n
"
;
source
<<
"real2 d10 = (SIGN)*make_real2("
<<
coeff
<<
"*d2.y-d3.y, d3.x-"
<<
coeff
<<
"*d2.x);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+4*j*"
<<
m
<<
"] = c0+d4;
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(4*j+1)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
zsize
<<
"/"
<<
(
5
*
L
)
<<
"], d7+d9);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(4*j+2)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
2
*
zsize
)
<<
"/"
<<
(
5
*
L
)
<<
"], d8+d10);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(4*j+3)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
3
*
zsize
)
<<
"/"
<<
(
5
*
L
)
<<
"], d8-d10);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(4*j+4)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
4
*
zsize
)
<<
"/"
<<
(
5
*
L
)
<<
"], d7-d9);
\n
"
;
}
else
if
(
radix
==
4
)
{
source
<<
"real2 c0 = data"
<<
input
<<
"[base];
\n
"
;
source
<<
"real2 c1 = data"
<<
input
<<
"[base+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 c2 = data"
<<
input
<<
"[base+"
<<
(
2
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 c3 = data"
<<
input
<<
"[base+"
<<
(
3
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 d0 = c0+c2;
\n
"
;
source
<<
"real2 d1 = c0-c2;
\n
"
;
source
<<
"real2 d2 = c1+c3;
\n
"
;
source
<<
"real2 d3 = (SIGN)*make_real2(c1.y-c3.y, c3.x-c1.x);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+3*j*"
<<
m
<<
"] = d0+d2;
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(3*j+1)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
zsize
<<
"/"
<<
(
4
*
L
)
<<
"], d1+d3);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(3*j+2)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
2
*
zsize
)
<<
"/"
<<
(
4
*
L
)
<<
"], d0-d2);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(3*j+3)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
3
*
zsize
)
<<
"/"
<<
(
4
*
L
)
<<
"], d1-d3);
\n
"
;
}
else
if
(
radix
==
3
)
{
source
<<
"real2 c0 = data"
<<
input
<<
"[base];
\n
"
;
source
<<
"real2 c1 = data"
<<
input
<<
"[base+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 c2 = data"
<<
input
<<
"[base+"
<<
(
2
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 d0 = c1+c2;
\n
"
;
source
<<
"real2 d1 = c0-0.5f*d0;
\n
"
;
source
<<
"real2 d2 = (SIGN)*"
<<
context
.
doubleToString
(
sin
(
M_PI
/
3.0
))
<<
"*make_real2(c1.y-c2.y, c2.x-c1.x);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+2*j*"
<<
m
<<
"] = c0+d0;
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(2*j+1)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
zsize
<<
"/"
<<
(
3
*
L
)
<<
"], d1+d2);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(2*j+2)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
2
*
zsize
)
<<
"/"
<<
(
3
*
L
)
<<
"], d1-d2);
\n
"
;
}
else
if
(
radix
==
2
)
{
source
<<
"real2 c0 = data"
<<
input
<<
"[base];
\n
"
;
source
<<
"real2 c1 = data"
<<
input
<<
"[base+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"data"
<<
output
<<
"[base+j*"
<<
m
<<
"] = c0+c1;
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(j+1)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
zsize
<<
"/"
<<
(
2
*
L
)
<<
"], c0-c1);
\n
"
;
}
source
<<
"}
\n
"
;
m
=
m
*
radix
;
source
<<
"__syncthreads();
\n
"
;
source
<<
"}
\n
"
;
++
stage
;
}
// Create the kernel.
bool
outputIsReal
=
(
inputIsReal
&&
axis
==
2
&&
!
forward
);
bool
outputIsPacked
=
(
inputIsReal
&&
axis
==
2
&&
forward
);
string
outputSuffix
=
(
outputIsReal
?
".x"
:
""
);
if
(
outputIsPacked
)
source
<<
"if (index < XSIZE*YSIZE && x < XSIZE/2+1)
\n
"
;
else
source
<<
"if (index < XSIZE*YSIZE)
\n
"
;
source
<<
"for (int i = threadIdx.x-block*THREADS_PER_BLOCK; i < ZSIZE; i += THREADS_PER_BLOCK)
\n
"
;
if
(
outputIsPacked
)
source
<<
"out[y*(ZSIZE*(XSIZE/2+1))+i*(XSIZE/2+1)+x] = data"
<<
(
stage
%
2
)
<<
"[i+block*ZSIZE]"
<<
outputSuffix
<<
";
\n
"
;
else
source
<<
"out[y*(ZSIZE*XSIZE)+i*XSIZE+x] = data"
<<
(
stage
%
2
)
<<
"[i+block*ZSIZE]"
<<
outputSuffix
<<
";
\n
"
;
map
<
string
,
string
>
replacements
;
replacements
[
"XSIZE"
]
=
context
.
intToString
(
xsize
);
replacements
[
"YSIZE"
]
=
context
.
intToString
(
ysize
);
replacements
[
"ZSIZE"
]
=
context
.
intToString
(
zsize
);
replacements
[
"BLOCKS_PER_GROUP"
]
=
context
.
intToString
(
blocksPerGroup
);
replacements
[
"THREADS_PER_BLOCK"
]
=
context
.
intToString
(
threadsPerBlock
);
replacements
[
"M_PI"
]
=
context
.
doubleToString
(
M_PI
);
replacements
[
"COMPUTE_FFT"
]
=
source
.
str
();
replacements
[
"SIGN"
]
=
(
forward
?
"1"
:
"-1"
);
replacements
[
"INPUT_TYPE"
]
=
(
inputIsReal
&&
axis
==
0
&&
forward
?
"real"
:
"real2"
);
replacements
[
"OUTPUT_TYPE"
]
=
(
outputIsReal
?
"real"
:
"real2"
);
replacements
[
"INPUT_IS_REAL"
]
=
(
inputIsReal
&&
axis
==
0
&&
forward
?
"1"
:
"0"
);
replacements
[
"INPUT_IS_PACKED"
]
=
(
inputIsReal
&&
axis
==
0
&&
!
forward
?
"1"
:
"0"
);
replacements
[
"OUTPUT_IS_PACKED"
]
=
(
outputIsPacked
?
"1"
:
"0"
);
CUmodule
module
=
context
.
createModule
(
CudaKernelSources
::
vectorOps
+
context
.
replaceStrings
(
CudaKernelSources
::
fft
,
replacements
));
CUfunction
kernel
=
context
.
getKernel
(
module
,
"execFFT"
);
threads
=
blocksPerGroup
*
threadsPerBlock
;
return
kernel
;
}
platforms/cuda/src/CudaKernels.cpp
View file @
97ebc356
...
@@ -1497,11 +1497,15 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
...
@@ -1497,11 +1497,15 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
delete
pmeAtomGridIndex
;
delete
pmeAtomGridIndex
;
if
(
sort
!=
NULL
)
if
(
sort
!=
NULL
)
delete
sort
;
delete
sort
;
if
(
fft
!=
NULL
)
delete
fft
;
if
(
pmeio
!=
NULL
)
if
(
pmeio
!=
NULL
)
delete
pmeio
;
delete
pmeio
;
if
(
hasInitializedFFT
)
{
if
(
hasInitializedFFT
)
{
cufftDestroy
(
fftForward
);
if
(
useCudaFFT
)
{
cufftDestroy
(
fftBackward
);
cufftDestroy
(
fftForward
);
cufftDestroy
(
fftBackward
);
}
if
(
usePmeStream
)
{
if
(
usePmeStream
)
{
cuStreamDestroy
(
pmeStream
);
cuStreamDestroy
(
pmeStream
);
cuEventDestroy
(
pmeSyncEvent
);
cuEventDestroy
(
pmeSyncEvent
);
...
@@ -1509,26 +1513,6 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
...
@@ -1509,26 +1513,6 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
}
}
}
}
/**
* Select a size for an FFT that is a multiple of 2, 3, 5, and 7.
*/
static
int
findFFTDimension
(
int
minimum
)
{
if
(
minimum
<
1
)
return
1
;
while
(
true
)
{
// Attempt to factor the current value.
int
unfactored
=
minimum
;
for
(
int
factor
=
2
;
factor
<
8
;
factor
++
)
{
while
(
unfactored
>
1
&&
unfactored
%
factor
==
0
)
unfactored
/=
factor
;
}
if
(
unfactored
==
1
)
return
minimum
;
minimum
++
;
}
}
void
CudaCalcNonbondedForceKernel
::
initialize
(
const
System
&
system
,
const
NonbondedForce
&
force
)
{
void
CudaCalcNonbondedForceKernel
::
initialize
(
const
System
&
system
,
const
NonbondedForce
&
force
)
{
cu
.
setAsCurrent
();
cu
.
setAsCurrent
();
...
@@ -1643,9 +1627,9 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
...
@@ -1643,9 +1627,9 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
int
gridSizeX
,
gridSizeY
,
gridSizeZ
;
int
gridSizeX
,
gridSizeY
,
gridSizeZ
;
NonbondedForceImpl
::
calcPMEParameters
(
system
,
force
,
alpha
,
gridSizeX
,
gridSizeY
,
gridSizeZ
);
NonbondedForceImpl
::
calcPMEParameters
(
system
,
force
,
alpha
,
gridSizeX
,
gridSizeY
,
gridSizeZ
);
gridSizeX
=
findFFT
Dimension
(
gridSizeX
);
gridSizeX
=
CudaFFT3D
::
findLegal
Dimension
(
gridSizeX
);
gridSizeY
=
findFFT
Dimension
(
gridSizeY
);
gridSizeY
=
CudaFFT3D
::
findLegal
Dimension
(
gridSizeY
);
gridSizeZ
=
findFFT
Dimension
(
gridSizeZ
);
gridSizeZ
=
CudaFFT3D
::
findLegal
Dimension
(
gridSizeZ
);
defines
[
"EWALD_ALPHA"
]
=
cu
.
doubleToString
(
alpha
);
defines
[
"EWALD_ALPHA"
]
=
cu
.
doubleToString
(
alpha
);
defines
[
"TWO_OVER_SQRT_PI"
]
=
cu
.
doubleToString
(
2.0
/
sqrt
(
M_PI
));
defines
[
"TWO_OVER_SQRT_PI"
]
=
cu
.
doubleToString
(
2.0
/
sqrt
(
M_PI
));
...
@@ -1692,38 +1676,40 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
...
@@ -1692,38 +1676,40 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
// Create required data structures.
// Create required data structures.
int
elementSize
=
(
cu
.
getUseDoublePrecision
()
?
sizeof
(
double
)
:
sizeof
(
float
));
int
elementSize
=
(
cu
.
getUseDoublePrecision
()
?
sizeof
(
double
)
:
sizeof
(
float
));
directPmeGrid
=
new
CudaArray
(
cu
,
gridSizeX
*
gridSizeY
*
gridSizeZ
,
cu
.
getComputeCapability
()
>=
2.0
?
2
*
elementSize
:
2
*
sizeof
(
long
long
),
"originalPmeGrid"
);
directPmeGrid
=
new
CudaArray
(
cu
,
gridSizeX
*
gridSizeY
*
gridSizeZ
,
cu
.
getComputeCapability
()
>=
2.0
?
elementSize
:
sizeof
(
long
long
),
"originalPmeGrid"
);
reciprocalPmeGrid
=
new
CudaArray
(
cu
,
gridSizeX
*
gridSizeY
*
gridSizeZ
,
2
*
elementSize
,
"reciprocalPmeGrid"
);
reciprocalPmeGrid
=
new
CudaArray
(
cu
,
gridSizeX
*
gridSizeY
*
(
gridSizeZ
/
2
+
1
),
2
*
elementSize
,
"reciprocalPmeGrid"
);
cu
.
addAutoclearBuffer
(
*
directPmeGrid
);
cu
.
addAutoclearBuffer
(
*
directPmeGrid
);
pmeBsplineModuliX
=
new
CudaArray
(
cu
,
gridSizeX
,
elementSize
,
"pmeBsplineModuliX"
);
pmeBsplineModuliX
=
new
CudaArray
(
cu
,
gridSizeX
,
elementSize
,
"pmeBsplineModuliX"
);
pmeBsplineModuliY
=
new
CudaArray
(
cu
,
gridSizeY
,
elementSize
,
"pmeBsplineModuliY"
);
pmeBsplineModuliY
=
new
CudaArray
(
cu
,
gridSizeY
,
elementSize
,
"pmeBsplineModuliY"
);
pmeBsplineModuliZ
=
new
CudaArray
(
cu
,
gridSizeZ
,
elementSize
,
"pmeBsplineModuliZ"
);
pmeBsplineModuliZ
=
new
CudaArray
(
cu
,
gridSizeZ
,
elementSize
,
"pmeBsplineModuliZ"
);
pmeAtomRange
=
CudaArray
::
create
<
int
>
(
cu
,
gridSizeX
*
gridSizeY
*
gridSizeZ
+
1
,
"pmeAtomRange"
);
pmeAtomRange
=
CudaArray
::
create
<
int
>
(
cu
,
gridSizeX
*
gridSizeY
*
gridSizeZ
+
1
,
"pmeAtomRange"
);
pmeAtomGridIndex
=
CudaArray
::
create
<
int2
>
(
cu
,
numParticles
,
"pmeAtomGridIndex"
);
pmeAtomGridIndex
=
CudaArray
::
create
<
int2
>
(
cu
,
numParticles
,
"pmeAtomGridIndex"
);
sort
=
new
CudaSort
(
cu
,
new
SortTrait
(),
cu
.
getNumAtoms
());
sort
=
new
CudaSort
(
cu
,
new
SortTrait
(),
cu
.
getNumAtoms
());
useCudaFFT
=
false
;
// We might switch back in the future, once Nvidia has all their bugs worked out
cufftResult
result
=
cufftPlan3d
(
&
fftForward
,
gridSizeX
,
gridSizeY
,
gridSizeZ
,
cu
.
getUseDoublePrecision
()
?
CUFFT_D2Z
:
CUFFT_R2C
);
if
(
useCudaFFT
)
{
if
(
result
!=
CUFFT_SUCCESS
)
cufftResult
result
=
cufftPlan3d
(
&
fftForward
,
gridSizeX
,
gridSizeY
,
gridSizeZ
,
cu
.
getUseDoublePrecision
()
?
CUFFT_D2Z
:
CUFFT_R2C
);
throw
OpenMMException
(
"Error initializing FFT: "
+
cu
.
intToString
(
result
));
if
(
result
!=
CUFFT_SUCCESS
)
result
=
cufftPlan3d
(
&
fftBackward
,
gridSizeX
,
gridSizeY
,
gridSizeZ
,
cu
.
getUseDoublePrecision
()
?
CUFFT_Z2D
:
CUFFT_C2R
);
throw
OpenMMException
(
"Error initializing FFT: "
+
cu
.
intToString
(
result
));
if
(
result
!=
CUFFT_SUCCESS
)
result
=
cufftPlan3d
(
&
fftBackward
,
gridSizeX
,
gridSizeY
,
gridSizeZ
,
cu
.
getUseDoublePrecision
()
?
CUFFT_Z2D
:
CUFFT_C2R
);
throw
OpenMMException
(
"Error initializing FFT: "
+
cu
.
intToString
(
result
));
if
(
result
!=
CUFFT_SUCCESS
)
throw
OpenMMException
(
"Error initializing FFT: "
+
cu
.
intToString
(
result
));
cufftSetCompatibilityMode
(
fftForward
,
CUFFT_COMPATIBILITY_NATIVE
);
cufftSetCompatibilityMode
(
fftForward
,
CUFFT_COMPATIBILITY_NATIVE
);
cufftSetCompatibilityMode
(
fftBackward
,
CUFFT_COMPATIBILITY_NATIVE
);
cufftSetCompatibilityMode
(
fftBackward
,
CUFFT_COMPATIBILITY_NATIVE
);
}
else
fft
=
new
CudaFFT3D
(
cu
,
gridSizeX
,
gridSizeY
,
gridSizeZ
,
true
);
// Prepare for doing PME on its own stream.
// Prepare for doing PME on its own stream.
int
cufftVersion
;
char
deviceName
[
100
]
;
cu
fftGetVersion
(
&
cufftVersion
);
cu
DeviceGetName
(
deviceName
,
100
,
cu
.
getDevice
()
);
usePmeStream
=
(
cu
.
getComputeCapability
()
<
5.0
&&
numParticles
<
130000
&&
cufftVersion
>=
6000
&&
cufftVersion
!=
7000
);
// Workarounds for various CUDA bugs
usePmeStream
=
(
string
(
deviceName
)
!=
"GeForce GTX 980"
);
// Using a separate stream is slower on GTX 980
if
(
usePmeStream
)
{
if
(
usePmeStream
)
{
cuStreamCreate
(
&
pmeStream
,
CU_STREAM_NON_BLOCKING
);
cuStreamCreate
(
&
pmeStream
,
CU_STREAM_NON_BLOCKING
);
cufftSetStream
(
fftForward
,
pmeStream
);
if
(
useCudaFFT
)
{
cufftSetStream
(
fftBackward
,
pmeStream
);
cufftSetStream
(
fftForward
,
pmeStream
);
cufftSetStream
(
fftBackward
,
pmeStream
);
}
CHECK_RESULT
(
cuEventCreate
(
&
pmeSyncEvent
,
CU_EVENT_DISABLE_TIMING
),
"Error creating event for NonbondedForce"
);
CHECK_RESULT
(
cuEventCreate
(
&
pmeSyncEvent
,
CU_EVENT_DISABLE_TIMING
),
"Error creating event for NonbondedForce"
);
int
recipForceGroup
=
force
.
getReciprocalSpaceForceGroup
();
int
recipForceGroup
=
force
.
getReciprocalSpaceForceGroup
();
if
(
recipForceGroup
<
0
)
if
(
recipForceGroup
<
0
)
...
@@ -1893,10 +1879,15 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
...
@@ -1893,10 +1879,15 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
cu
.
executeKernel
(
pmeFinishSpreadChargeKernel
,
finishSpreadArgs
,
directPmeGrid
->
getSize
());
cu
.
executeKernel
(
pmeFinishSpreadChargeKernel
,
finishSpreadArgs
,
directPmeGrid
->
getSize
());
}
}
if
(
cu
.
getUseDoublePrecision
())
if
(
useCudaFFT
)
{
cufftExecD2Z
(
fftForward
,
(
double
*
)
directPmeGrid
->
getDevicePointer
(),
(
double2
*
)
reciprocalPmeGrid
->
getDevicePointer
());
if
(
cu
.
getUseDoublePrecision
())
else
cufftExecD2Z
(
fftForward
,
(
double
*
)
directPmeGrid
->
getDevicePointer
(),
(
double2
*
)
reciprocalPmeGrid
->
getDevicePointer
());
cufftExecR2C
(
fftForward
,
(
float
*
)
directPmeGrid
->
getDevicePointer
(),
(
float2
*
)
reciprocalPmeGrid
->
getDevicePointer
());
else
cufftExecR2C
(
fftForward
,
(
float
*
)
directPmeGrid
->
getDevicePointer
(),
(
float2
*
)
reciprocalPmeGrid
->
getDevicePointer
());
}
else
{
fft
->
execFFT
(
*
directPmeGrid
,
*
reciprocalPmeGrid
,
true
);
}
if
(
includeEnergy
)
{
if
(
includeEnergy
)
{
void
*
computeEnergyArgs
[]
=
{
&
reciprocalPmeGrid
->
getDevicePointer
(),
&
cu
.
getEnergyBuffer
().
getDevicePointer
(),
void
*
computeEnergyArgs
[]
=
{
&
reciprocalPmeGrid
->
getDevicePointer
(),
&
cu
.
getEnergyBuffer
().
getDevicePointer
(),
...
@@ -1910,11 +1901,15 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
...
@@ -1910,11 +1901,15 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
cu
.
getPeriodicBoxSizePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
cu
.
getPeriodicBoxSizePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
]};
cu
.
executeKernel
(
pmeConvolutionKernel
,
convolutionArgs
,
cu
.
getNumAtoms
());
cu
.
executeKernel
(
pmeConvolutionKernel
,
convolutionArgs
,
cu
.
getNumAtoms
());
if
(
cu
.
getUseDoublePrecision
())
if
(
useCudaFFT
)
{
cufftExecZ2D
(
fftBackward
,
(
double2
*
)
reciprocalPmeGrid
->
getDevicePointer
(),
(
double
*
)
directPmeGrid
->
getDevicePointer
());
if
(
cu
.
getUseDoublePrecision
())
else
cufftExecZ2D
(
fftBackward
,
(
double2
*
)
reciprocalPmeGrid
->
getDevicePointer
(),
(
double
*
)
directPmeGrid
->
getDevicePointer
());
cufftExecC2R
(
fftBackward
,
(
float2
*
)
reciprocalPmeGrid
->
getDevicePointer
(),
(
float
*
)
directPmeGrid
->
getDevicePointer
());
else
cufftExecC2R
(
fftBackward
,
(
float2
*
)
reciprocalPmeGrid
->
getDevicePointer
(),
(
float
*
)
directPmeGrid
->
getDevicePointer
());
}
else
{
fft
->
execFFT
(
*
reciprocalPmeGrid
,
*
directPmeGrid
,
false
);
}
void
*
interpolateArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
cu
.
getForce
().
getDevicePointer
(),
&
directPmeGrid
->
getDevicePointer
(),
void
*
interpolateArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
cu
.
getForce
().
getDevicePointer
(),
&
directPmeGrid
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
],
&
pmeAtomGridIndex
->
getDevicePointer
()};
cu
.
getPeriodicBoxSizePointer
(),
recipBoxVectorPointer
[
0
],
recipBoxVectorPointer
[
1
],
recipBoxVectorPointer
[
2
],
&
pmeAtomGridIndex
->
getDevicePointer
()};
...
...
platforms/cuda/src/kernels/fft.cu
0 → 100644
View file @
97ebc356
static
__inline__
__device__
real2
multiplyComplex
(
real2
c1
,
real2
c2
)
{
return
make_real2
(
c1
.
x
*
c2
.
x
-
c1
.
y
*
c2
.
y
,
c1
.
x
*
c2
.
y
+
c1
.
y
*
c2
.
x
);
}
/**
* Load a value from the half-complex grid produces by a real-to-complex transform.
*/
static
__inline__
__device__
real2
loadComplexValue
(
const
real2
*
__restrict__
in
,
int
x
,
int
y
,
int
z
)
{
const
int
inputZSize
=
ZSIZE
/
2
+
1
;
if
(
z
<
inputZSize
)
return
in
[
x
*
YSIZE
*
inputZSize
+
y
*
inputZSize
+
z
];
int
xp
=
(
x
==
0
?
0
:
XSIZE
-
x
);
int
yp
=
(
y
==
0
?
0
:
YSIZE
-
y
);
real2
value
=
in
[
xp
*
YSIZE
*
inputZSize
+
yp
*
inputZSize
+
(
ZSIZE
-
z
)];
return
make_real2
(
value
.
x
,
-
value
.
y
);
}
/**
* Perform a 1D FFT on each row along one axis.
*/
extern
"C"
__global__
void
execFFT
(
const
INPUT_TYPE
*
__restrict__
in
,
OUTPUT_TYPE
*
__restrict__
out
)
{
__shared__
real2
w
[
ZSIZE
];
__shared__
real2
data0
[
BLOCKS_PER_GROUP
*
ZSIZE
];
__shared__
real2
data1
[
BLOCKS_PER_GROUP
*
ZSIZE
];
for
(
int
i
=
threadIdx
.
x
;
i
<
ZSIZE
;
i
+=
blockDim
.
x
)
w
[
i
]
=
make_real2
(
cos
(
-
(
SIGN
)
*
i
*
2
*
M_PI
/
ZSIZE
),
sin
(
-
(
SIGN
)
*
i
*
2
*
M_PI
/
ZSIZE
));
__syncthreads
();
const
int
block
=
threadIdx
.
x
/
THREADS_PER_BLOCK
;
for
(
int
baseIndex
=
blockIdx
.
x
*
BLOCKS_PER_GROUP
;
baseIndex
<
XSIZE
*
YSIZE
;
baseIndex
+=
gridDim
.
x
*
BLOCKS_PER_GROUP
)
{
int
index
=
baseIndex
+
block
;
int
x
=
index
/
YSIZE
;
int
y
=
index
-
x
*
YSIZE
;
#if OUTPUT_IS_PACKED
if
(
x
<
XSIZE
/
2
+
1
)
{
#endif
if
(
index
<
XSIZE
*
YSIZE
)
for
(
int
i
=
threadIdx
.
x
-
block
*
THREADS_PER_BLOCK
;
i
<
ZSIZE
;
i
+=
THREADS_PER_BLOCK
)
#if INPUT_IS_REAL
data0
[
i
+
block
*
ZSIZE
]
=
make_real2
(
in
[
x
*
(
YSIZE
*
ZSIZE
)
+
y
*
ZSIZE
+
i
],
0
);
#elif INPUT_IS_PACKED
data0
[
i
+
block
*
ZSIZE
]
=
loadComplexValue
(
in
,
x
,
y
,
i
);
#else
data0
[
i
+
block
*
ZSIZE
]
=
in
[
x
*
(
YSIZE
*
ZSIZE
)
+
y
*
ZSIZE
+
i
];
#endif
#if OUTPUT_IS_PACKED
}
#endif
__syncthreads
();
COMPUTE_FFT
}
}
platforms/cuda/src/kernels/fftR2C.cu
0 → 100644
View file @
97ebc356
/**
* Combine the two halves of a real grid into a complex grid that is half as large.
*/
extern
"C"
__global__
void
packForwardData
(
const
real
*
__restrict__
in
,
real2
*
__restrict__
out
)
{
const
int
gridSize
=
PACKED_XSIZE
*
PACKED_YSIZE
*
PACKED_ZSIZE
;
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
gridSize
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
x
=
index
/
(
PACKED_YSIZE
*
PACKED_ZSIZE
);
int
remainder
=
index
-
x
*
(
PACKED_YSIZE
*
PACKED_ZSIZE
);
int
y
=
remainder
/
PACKED_ZSIZE
;
int
z
=
remainder
-
y
*
PACKED_ZSIZE
;
#if PACKED_AXIS == 0
real2
value
=
make_real2
(
in
[
2
*
x
*
YSIZE
*
ZSIZE
+
y
*
ZSIZE
+
z
],
in
[(
2
*
x
+
1
)
*
YSIZE
*
ZSIZE
+
y
*
ZSIZE
+
z
]);
#elif PACKED_AXIS == 1
real2
value
=
make_real2
(
in
[
x
*
YSIZE
*
ZSIZE
+
2
*
y
*
ZSIZE
+
z
],
in
[
x
*
YSIZE
*
ZSIZE
+
(
2
*
y
+
1
)
*
ZSIZE
+
z
]);
#else
real2
value
=
make_real2
(
in
[
x
*
YSIZE
*
ZSIZE
+
y
*
ZSIZE
+
2
*
z
],
in
[
x
*
YSIZE
*
ZSIZE
+
y
*
ZSIZE
+
(
2
*
z
+
1
)]);
#endif
out
[
index
]
=
value
;
}
}
/**
* Split the transformed data back into a full sized, symmetric grid.
*/
extern
"C"
__global__
void
unpackForwardData
(
const
real2
*
__restrict__
in
,
real2
*
__restrict__
out
)
{
// Compute the phase factors.
#if PACKED_AXIS == 0
__shared__
real2
w
[
PACKED_XSIZE
];
for
(
int
i
=
threadIdx
.
x
;
i
<
PACKED_XSIZE
;
i
+=
blockDim
.
x
)
w
[
i
]
=
make_real2
(
sin
(
i
*
2
*
M_PI
/
XSIZE
),
cos
(
i
*
2
*
M_PI
/
XSIZE
));
#elif PACKED_AXIS == 1
__shared__
real2
w
[
PACKED_YSIZE
];
for
(
int
i
=
threadIdx
.
x
;
i
<
PACKED_YSIZE
;
i
+=
blockDim
.
x
)
w
[
i
]
=
make_real2
(
sin
(
i
*
2
*
M_PI
/
YSIZE
),
cos
(
i
*
2
*
M_PI
/
YSIZE
));
#else
__shared__
real2
w
[
PACKED_ZSIZE
];
for
(
int
i
=
threadIdx
.
x
;
i
<
PACKED_ZSIZE
;
i
+=
blockDim
.
x
)
w
[
i
]
=
make_real2
(
sin
(
i
*
2
*
M_PI
/
ZSIZE
),
cos
(
i
*
2
*
M_PI
/
ZSIZE
));
#endif
__syncthreads
();
// Transform the data.
const
int
gridSize
=
PACKED_XSIZE
*
PACKED_YSIZE
*
PACKED_ZSIZE
;
const
int
outputZSize
=
ZSIZE
/
2
+
1
;
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
gridSize
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
x
=
index
/
(
PACKED_YSIZE
*
PACKED_ZSIZE
);
int
remainder
=
index
-
x
*
(
PACKED_YSIZE
*
PACKED_ZSIZE
);
int
y
=
remainder
/
PACKED_ZSIZE
;
int
z
=
remainder
-
y
*
PACKED_ZSIZE
;
int
xp
=
(
x
==
0
?
0
:
PACKED_XSIZE
-
x
);
int
yp
=
(
y
==
0
?
0
:
PACKED_YSIZE
-
y
);
int
zp
=
(
z
==
0
?
0
:
PACKED_ZSIZE
-
z
);
real2
z1
=
in
[
x
*
PACKED_YSIZE
*
PACKED_ZSIZE
+
y
*
PACKED_ZSIZE
+
z
];
real2
z2
=
in
[
xp
*
PACKED_YSIZE
*
PACKED_ZSIZE
+
yp
*
PACKED_ZSIZE
+
zp
];
#if PACKED_AXIS == 0
real2
wfac
=
w
[
x
];
#elif PACKED_AXIS == 1
real2
wfac
=
w
[
y
];
#else
real2
wfac
=
w
[
z
];
#endif
real2
output
=
make_real2
((
z1
.
x
+
z2
.
x
-
wfac
.
x
*
(
z1
.
x
-
z2
.
x
)
+
wfac
.
y
*
(
z1
.
y
+
z2
.
y
))
/
2
,
(
z1
.
y
-
z2
.
y
-
wfac
.
y
*
(
z1
.
x
-
z2
.
x
)
-
wfac
.
x
*
(
z1
.
y
+
z2
.
y
))
/
2
);
if
(
z
<
outputZSize
)
out
[
x
*
YSIZE
*
outputZSize
+
y
*
outputZSize
+
z
]
=
output
;
xp
=
(
x
==
0
?
0
:
XSIZE
-
x
);
yp
=
(
y
==
0
?
0
:
YSIZE
-
y
);
zp
=
(
z
==
0
?
0
:
ZSIZE
-
z
);
if
(
zp
<
outputZSize
)
{
#if PACKED_AXIS == 0
if
(
x
==
0
)
out
[
PACKED_XSIZE
*
YSIZE
*
outputZSize
+
yp
*
outputZSize
+
zp
]
=
make_real2
((
z1
.
x
-
z1
.
y
+
z2
.
x
-
z2
.
y
)
/
2
,
(
-
z1
.
x
-
z1
.
y
+
z2
.
x
+
z2
.
y
)
/
2
);
#elif PACKED_AXIS == 1
if
(
y
==
0
)
out
[
xp
*
YSIZE
*
outputZSize
+
PACKED_YSIZE
*
outputZSize
+
zp
]
=
make_real2
((
z1
.
x
-
z1
.
y
+
z2
.
x
-
z2
.
y
)
/
2
,
(
-
z1
.
x
-
z1
.
y
+
z2
.
x
+
z2
.
y
)
/
2
);
#else
if
(
z
==
0
)
out
[
xp
*
YSIZE
*
outputZSize
+
yp
*
outputZSize
+
PACKED_ZSIZE
]
=
make_real2
((
z1
.
x
-
z1
.
y
+
z2
.
x
-
z2
.
y
)
/
2
,
(
-
z1
.
x
-
z1
.
y
+
z2
.
x
+
z2
.
y
)
/
2
);
#endif
else
out
[
xp
*
YSIZE
*
outputZSize
+
yp
*
outputZSize
+
zp
]
=
make_real2
(
output
.
x
,
-
output
.
y
);
}
}
}
/**
* Load a value from the half-complex grid produced by a real-to-complex transform.
*/
static
__inline__
__device__
real2
loadComplexValue
(
const
real2
*
__restrict__
in
,
int
x
,
int
y
,
int
z
)
{
const
int
inputZSize
=
ZSIZE
/
2
+
1
;
if
(
z
<
inputZSize
)
return
in
[
x
*
YSIZE
*
inputZSize
+
y
*
inputZSize
+
z
];
int
xp
=
(
x
==
0
?
0
:
XSIZE
-
x
);
int
yp
=
(
y
==
0
?
0
:
YSIZE
-
y
);
real2
value
=
in
[
xp
*
YSIZE
*
inputZSize
+
yp
*
inputZSize
+
(
ZSIZE
-
z
)];
return
make_real2
(
value
.
x
,
-
value
.
y
);
}
/**
* Repack the symmetric complex grid into one half as large in preparation for doing an inverse complex-to-real transform.
*/
extern
"C"
__global__
void
packBackwardData
(
const
real2
*
__restrict__
in
,
real2
*
__restrict__
out
)
{
// Compute the phase factors.
#if PACKED_AXIS == 0
__shared__
real2
w
[
PACKED_XSIZE
];
for
(
int
i
=
threadIdx
.
x
;
i
<
PACKED_XSIZE
;
i
+=
blockDim
.
x
)
w
[
i
]
=
make_real2
(
cos
(
i
*
2
*
M_PI
/
XSIZE
),
sin
(
i
*
2
*
M_PI
/
XSIZE
));
#elif PACKED_AXIS == 1
__shared__
real2
w
[
PACKED_YSIZE
];
for
(
int
i
=
threadIdx
.
x
;
i
<
PACKED_YSIZE
;
i
+=
blockDim
.
x
)
w
[
i
]
=
make_real2
(
cos
(
i
*
2
*
M_PI
/
YSIZE
),
sin
(
i
*
2
*
M_PI
/
YSIZE
));
#else
__shared__
real2
w
[
PACKED_ZSIZE
];
for
(
int
i
=
threadIdx
.
x
;
i
<
PACKED_ZSIZE
;
i
+=
blockDim
.
x
)
w
[
i
]
=
make_real2
(
cos
(
i
*
2
*
M_PI
/
ZSIZE
),
sin
(
i
*
2
*
M_PI
/
ZSIZE
));
#endif
__syncthreads
();
// Transform the data.
const
int
gridSize
=
PACKED_XSIZE
*
PACKED_YSIZE
*
PACKED_ZSIZE
;
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
gridSize
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
x
=
index
/
(
PACKED_YSIZE
*
PACKED_ZSIZE
);
int
remainder
=
index
-
x
*
(
PACKED_YSIZE
*
PACKED_ZSIZE
);
int
y
=
remainder
/
PACKED_ZSIZE
;
int
z
=
remainder
-
y
*
PACKED_ZSIZE
;
int
xp
=
(
x
==
0
?
0
:
PACKED_XSIZE
-
x
);
int
yp
=
(
y
==
0
?
0
:
PACKED_YSIZE
-
y
);
int
zp
=
(
z
==
0
?
0
:
PACKED_ZSIZE
-
z
);
real2
z1
=
loadComplexValue
(
in
,
x
,
y
,
z
);
#if PACKED_AXIS == 0
real2
wfac
=
w
[
x
];
real2
z2
=
loadComplexValue
(
in
,
PACKED_XSIZE
-
x
,
yp
,
zp
);
#elif PACKED_AXIS == 1
real2
wfac
=
w
[
y
];
real2
z2
=
loadComplexValue
(
in
,
xp
,
PACKED_YSIZE
-
y
,
zp
);
#else
real2
wfac
=
w
[
z
];
real2
z2
=
loadComplexValue
(
in
,
xp
,
yp
,
PACKED_ZSIZE
-
z
);
#endif
real2
even
=
make_real2
((
z1
.
x
+
z2
.
x
)
/
2
,
(
z1
.
y
-
z2
.
y
)
/
2
);
real2
odd
=
make_real2
((
z1
.
x
-
z2
.
x
)
/
2
,
(
z1
.
y
+
z2
.
y
)
/
2
);
odd
=
make_real2
(
odd
.
x
*
wfac
.
x
-
odd
.
y
*
wfac
.
y
,
odd
.
y
*
wfac
.
x
+
odd
.
x
*
wfac
.
y
);
out
[
x
*
PACKED_YSIZE
*
PACKED_ZSIZE
+
y
*
PACKED_ZSIZE
+
z
]
=
make_real2
(
even
.
x
-
odd
.
y
,
even
.
y
+
odd
.
x
);
}
}
/**
* Split the data back into a full sized, real grid after an inverse transform.
*/
extern
"C"
__global__
void
unpackBackwardData
(
const
real2
*
__restrict__
in
,
real
*
__restrict__
out
)
{
const
int
gridSize
=
PACKED_XSIZE
*
PACKED_YSIZE
*
PACKED_ZSIZE
;
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
gridSize
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
x
=
index
/
(
PACKED_YSIZE
*
PACKED_ZSIZE
);
int
remainder
=
index
-
x
*
(
PACKED_YSIZE
*
PACKED_ZSIZE
);
int
y
=
remainder
/
PACKED_ZSIZE
;
int
z
=
remainder
-
y
*
PACKED_ZSIZE
;
real2
value
=
2
*
in
[
index
];
#if PACKED_AXIS == 0
out
[
2
*
x
*
YSIZE
*
ZSIZE
+
y
*
ZSIZE
+
z
]
=
value
.
x
;
out
[(
2
*
x
+
1
)
*
YSIZE
*
ZSIZE
+
y
*
ZSIZE
+
z
]
=
value
.
y
;
#elif PACKED_AXIS == 1
out
[
x
*
YSIZE
*
ZSIZE
+
2
*
y
*
ZSIZE
+
z
]
=
value
.
x
;
out
[
x
*
YSIZE
*
ZSIZE
+
(
2
*
y
+
1
)
*
ZSIZE
+
z
]
=
value
.
y
;
#else
out
[
x
*
YSIZE
*
ZSIZE
+
y
*
ZSIZE
+
2
*
z
]
=
value
.
x
;
out
[
x
*
YSIZE
*
ZSIZE
+
y
*
ZSIZE
+
(
2
*
z
+
1
)]
=
value
.
y
;
#endif
}
}
platforms/cuda/tests/TestCudaFFT3D.cpp
0 → 100644
View file @
97ebc356
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011-2015 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
/**
* This tests the CUDA implementation of sorting.
*/
#include "openmm/internal/AssertionUtilities.h"
#include "CudaArray.h"
#include "CudaContext.h"
#include "CudaFFT3D.h"
#include "CudaSort.h"
#include "fftpack.h"
#include "sfmt/SFMT.h"
#include "openmm/System.h"
#include <iostream>
#include <cmath>
#include <set>
using
namespace
OpenMM
;
using
namespace
std
;
static
CudaPlatform
platform
;
template
<
class
Real2
>
void
testTransform
(
bool
realToComplex
,
int
xsize
,
int
ysize
,
int
zsize
)
{
System
system
;
system
.
addParticle
(
0.0
);
CudaPlatform
::
PlatformData
platformData
(
NULL
,
system
,
""
,
"true"
,
platform
.
getPropertyDefaultValue
(
"CudaPrecision"
),
"false"
,
platform
.
getPropertyDefaultValue
(
CudaPlatform
::
CudaCompiler
()),
platform
.
getPropertyDefaultValue
(
CudaPlatform
::
CudaTempDirectory
()),
platform
.
getPropertyDefaultValue
(
CudaPlatform
::
CudaHostCompiler
()));
CudaContext
&
context
=
*
platformData
.
contexts
[
0
];
context
.
initialize
();
OpenMM_SFMT
::
SFMT
sfmt
;
init_gen_rand
(
0
,
sfmt
);
vector
<
Real2
>
original
(
xsize
*
ysize
*
zsize
);
vector
<
t_complex
>
reference
(
original
.
size
());
for
(
int
i
=
0
;
i
<
(
int
)
original
.
size
();
i
++
)
{
Real2
value
;
value
.
x
=
(
float
)
genrand_real2
(
sfmt
);
value
.
y
=
(
float
)
genrand_real2
(
sfmt
);
original
[
i
]
=
value
;
reference
[
i
]
=
t_complex
(
value
.
x
,
value
.
y
);
}
for
(
int
i
=
0
;
i
<
(
int
)
reference
.
size
();
i
++
)
{
if
(
realToComplex
)
reference
[
i
]
=
t_complex
(
i
%
2
==
0
?
original
[
i
/
2
].
x
:
original
[
i
/
2
].
y
,
0
);
else
reference
[
i
]
=
t_complex
(
original
[
i
].
x
,
original
[
i
].
y
);
}
CudaArray
grid1
(
context
,
original
.
size
(),
sizeof
(
Real2
),
"grid1"
);
CudaArray
grid2
(
context
,
original
.
size
(),
sizeof
(
Real2
),
"grid2"
);
grid1
.
upload
(
original
);
CudaFFT3D
fft
(
context
,
xsize
,
ysize
,
zsize
,
realToComplex
);
// Perform a forward FFT, then verify the result is correct.
fft
.
execFFT
(
grid1
,
grid2
,
true
);
vector
<
Real2
>
result
;
grid2
.
download
(
result
);
fftpack_t
plan
;
fftpack_init_3d
(
&
plan
,
xsize
,
ysize
,
zsize
);
fftpack_exec_3d
(
plan
,
FFTPACK_FORWARD
,
&
reference
[
0
],
&
reference
[
0
]);
int
outputZSize
=
(
realToComplex
?
zsize
/
2
+
1
:
zsize
);
for
(
int
x
=
0
;
x
<
xsize
;
x
++
)
for
(
int
y
=
0
;
y
<
ysize
;
y
++
)
for
(
int
z
=
0
;
z
<
outputZSize
;
z
++
)
{
int
index1
=
x
*
ysize
*
zsize
+
y
*
zsize
+
z
;
int
index2
=
x
*
ysize
*
outputZSize
+
y
*
outputZSize
+
z
;
ASSERT_EQUAL_TOL
(
reference
[
index1
].
re
,
result
[
index2
].
x
,
1e-3
);
ASSERT_EQUAL_TOL
(
reference
[
index1
].
im
,
result
[
index2
].
y
,
1e-3
);
}
fftpack_destroy
(
plan
);
// Perform a backward transform and see if we get the original values.
fft
.
execFFT
(
grid2
,
grid1
,
false
);
grid1
.
download
(
result
);
double
scale
=
1.0
/
(
xsize
*
ysize
*
zsize
);
int
valuesToCheck
=
(
realToComplex
?
original
.
size
()
/
2
:
original
.
size
());
for
(
int
i
=
0
;
i
<
valuesToCheck
;
++
i
)
{
ASSERT_EQUAL_TOL
(
original
[
i
].
x
,
scale
*
result
[
i
].
x
,
1e-4
);
ASSERT_EQUAL_TOL
(
original
[
i
].
y
,
scale
*
result
[
i
].
y
,
1e-4
);
}
}
int
main
(
int
argc
,
char
*
argv
[])
{
try
{
if
(
argc
>
1
)
platform
.
setPropertyDefaultValue
(
"CudaPrecision"
,
string
(
argv
[
1
]));
if
(
platform
.
getPropertyDefaultValue
(
"CudaPrecision"
)
==
"double"
)
{
testTransform
<
double2
>
(
false
,
28
,
25
,
30
);
testTransform
<
double2
>
(
true
,
28
,
25
,
25
);
testTransform
<
double2
>
(
true
,
25
,
28
,
25
);
testTransform
<
double2
>
(
true
,
25
,
25
,
28
);
testTransform
<
double2
>
(
true
,
21
,
25
,
27
);
}
else
{
testTransform
<
float2
>
(
false
,
28
,
25
,
30
);
testTransform
<
float2
>
(
true
,
28
,
25
,
25
);
testTransform
<
float2
>
(
true
,
25
,
28
,
25
);
testTransform
<
float2
>
(
true
,
25
,
25
,
28
);
testTransform
<
float2
>
(
true
,
21
,
25
,
27
);
}
}
catch
(
const
exception
&
e
)
{
cout
<<
"exception: "
<<
e
.
what
()
<<
endl
;
return
1
;
}
cout
<<
"Done"
<<
endl
;
return
0
;
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment