Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
ad62d81e
Commit
ad62d81e
authored
Apr 15, 2015
by
Peter Eastman
Browse files
OpenCL FFT stores only non-redundant elements
parent
2db04610
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
149 additions
and
45 deletions
+149
-45
platforms/opencl/include/OpenCLFFT3D.h
platforms/opencl/include/OpenCLFFT3D.h
+3
-0
platforms/opencl/include/OpenCLKernels.h
platforms/opencl/include/OpenCLKernels.h
+1
-0
platforms/opencl/src/OpenCLFFT3D.cpp
platforms/opencl/src/OpenCLFFT3D.cpp
+7
-1
platforms/opencl/src/OpenCLKernels.cpp
platforms/opencl/src/OpenCLKernels.cpp
+25
-15
platforms/opencl/src/kernels/fft.cl
platforms/opencl/src/kernels/fft.cl
+21
-0
platforms/opencl/src/kernels/fftR2C.cl
platforms/opencl/src/kernels/fftR2C.cl
+30
-13
platforms/opencl/src/kernels/pme.cl
platforms/opencl/src/kernels/pme.cl
+53
-12
platforms/opencl/tests/TestOpenCLFFT.cpp
platforms/opencl/tests/TestOpenCLFFT.cpp
+9
-4
No files found.
platforms/opencl/include/OpenCLFFT3D.h
View file @
ad62d81e
...
@@ -69,6 +69,9 @@ public:
...
@@ -69,6 +69,9 @@ public:
* arrays must be different. Also, the input array is used as workspace, so its contents
* arrays must be different. Also, the input array is used as workspace, so its contents
* are destroyed. This also means that both arrays must be large enough to hold complex values,
* are destroyed. This also means that both arrays must be large enough to hold complex values,
* even when performing a real-to-complex transform.
* even when performing a real-to-complex transform.
* <p>
* When performing a real-to-complex transform, the output data is of size xsize*ysize*(zsize/2+1)
* and contains only the non-redundant elements.
*
*
* @param in the data to transform, ordered such that in[x*ysize*zsize + y*zsize + z] contains element (x, y, z)
* @param in the data to transform, ordered such that in[x*ysize*zsize + y*zsize + z] contains element (x, y, z)
* @param out on exit, this contains the transformed data
* @param out on exit, this contains the transformed data
...
...
platforms/opencl/include/OpenCLKernels.h
View file @
ad62d81e
...
@@ -639,6 +639,7 @@ private:
...
@@ -639,6 +639,7 @@ private:
cl
::
Kernel
pmeSpreadChargeKernel
;
cl
::
Kernel
pmeSpreadChargeKernel
;
cl
::
Kernel
pmeFinishSpreadChargeKernel
;
cl
::
Kernel
pmeFinishSpreadChargeKernel
;
cl
::
Kernel
pmeConvolutionKernel
;
cl
::
Kernel
pmeConvolutionKernel
;
cl
::
Kernel
pmeEvalEnergyKernel
;
cl
::
Kernel
pmeInterpolateForceKernel
;
cl
::
Kernel
pmeInterpolateForceKernel
;
std
::
map
<
std
::
string
,
std
::
string
>
pmeDefines
;
std
::
map
<
std
::
string
,
std
::
string
>
pmeDefines
;
std
::
vector
<
std
::
pair
<
int
,
int
>
>
exceptionAtoms
;
std
::
vector
<
std
::
pair
<
int
,
int
>
>
exceptionAtoms
;
...
...
platforms/opencl/src/OpenCLFFT3D.cpp
View file @
ad62d81e
...
@@ -310,6 +310,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
...
@@ -310,6 +310,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
// Create the kernel.
// Create the kernel.
bool
outputIsReal
=
(
inputIsReal
&&
axis
==
2
&&
!
forward
);
bool
outputIsReal
=
(
inputIsReal
&&
axis
==
2
&&
!
forward
);
bool
outputIsPacked
=
(
inputIsReal
&&
axis
==
2
&&
forward
);
string
outputSuffix
=
(
outputIsReal
?
".x"
:
""
);
string
outputSuffix
=
(
outputIsReal
?
".x"
:
""
);
if
(
loopRequired
)
{
if
(
loopRequired
)
{
source
<<
"for (int z = get_local_id(0); z < ZSIZE; z += get_local_size(0))
\n
"
;
source
<<
"for (int z = get_local_id(0); z < ZSIZE; z += get_local_size(0))
\n
"
;
...
@@ -317,7 +318,10 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
...
@@ -317,7 +318,10 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
}
}
else
{
else
{
source
<<
"if (index < XSIZE*YSIZE)
\n
"
;
source
<<
"if (index < XSIZE*YSIZE)
\n
"
;
source
<<
"out[y*(ZSIZE*XSIZE)+(get_local_id(0)%ZSIZE)*XSIZE+x] = data"
<<
(
stage
%
2
)
<<
"[get_local_id(0)]"
<<
outputSuffix
<<
";
\n
"
;
if
(
outputIsPacked
)
source
<<
"out[y*(ZSIZE*(XSIZE/2+1))+(get_local_id(0)%ZSIZE)*(XSIZE/2+1)+x] = data"
<<
(
stage
%
2
)
<<
"[get_local_id(0)]"
<<
outputSuffix
<<
";
\n
"
;
else
source
<<
"out[y*(ZSIZE*XSIZE)+(get_local_id(0)%ZSIZE)*XSIZE+x] = data"
<<
(
stage
%
2
)
<<
"[get_local_id(0)]"
<<
outputSuffix
<<
";
\n
"
;
}
}
map
<
string
,
string
>
replacements
;
map
<
string
,
string
>
replacements
;
replacements
[
"XSIZE"
]
=
context
.
intToString
(
xsize
);
replacements
[
"XSIZE"
]
=
context
.
intToString
(
xsize
);
...
@@ -331,6 +335,8 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
...
@@ -331,6 +335,8 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
replacements
[
"INPUT_TYPE"
]
=
(
inputIsReal
&&
axis
==
0
&&
forward
?
"real"
:
"real2"
);
replacements
[
"INPUT_TYPE"
]
=
(
inputIsReal
&&
axis
==
0
&&
forward
?
"real"
:
"real2"
);
replacements
[
"OUTPUT_TYPE"
]
=
(
outputIsReal
?
"real"
:
"real2"
);
replacements
[
"OUTPUT_TYPE"
]
=
(
outputIsReal
?
"real"
:
"real2"
);
replacements
[
"INPUT_IS_REAL"
]
=
(
inputIsReal
&&
axis
==
0
&&
forward
?
"1"
:
"0"
);
replacements
[
"INPUT_IS_REAL"
]
=
(
inputIsReal
&&
axis
==
0
&&
forward
?
"1"
:
"0"
);
replacements
[
"INPUT_IS_PACKED"
]
=
(
inputIsReal
&&
axis
==
0
&&
!
forward
?
"1"
:
"0"
);
replacements
[
"OUTPUT_IS_PACKED"
]
=
(
outputIsPacked
?
"1"
:
"0"
);
cl
::
Program
program
=
context
.
createProgram
(
context
.
replaceStrings
(
OpenCLKernelSources
::
fft
,
replacements
));
cl
::
Program
program
=
context
.
createProgram
(
context
.
replaceStrings
(
OpenCLKernelSources
::
fft
,
replacements
));
cl
::
Kernel
kernel
(
program
,
"execFFT"
);
cl
::
Kernel
kernel
(
program
,
"execFFT"
);
threads
=
(
isCPU
?
1
:
blocksPerGroup
*
zsize
);
threads
=
(
isCPU
?
1
:
blocksPerGroup
*
zsize
);
...
...
platforms/opencl/src/OpenCLKernels.cpp
View file @
ad62d81e
...
@@ -1806,6 +1806,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
...
@@ -1806,6 +1806,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeZIndexKernel
=
cl
::
Kernel
(
program
,
"recordZIndex"
);
pmeZIndexKernel
=
cl
::
Kernel
(
program
,
"recordZIndex"
);
pmeSpreadChargeKernel
=
cl
::
Kernel
(
program
,
"gridSpreadCharge"
);
pmeSpreadChargeKernel
=
cl
::
Kernel
(
program
,
"gridSpreadCharge"
);
pmeConvolutionKernel
=
cl
::
Kernel
(
program
,
"reciprocalConvolution"
);
pmeConvolutionKernel
=
cl
::
Kernel
(
program
,
"reciprocalConvolution"
);
pmeEvalEnergyKernel
=
cl
::
Kernel
(
program
,
"gridEvaluateEnergy"
);
pmeInterpolateForceKernel
=
cl
::
Kernel
(
program
,
"gridInterpolateForce"
);
pmeInterpolateForceKernel
=
cl
::
Kernel
(
program
,
"gridInterpolateForce"
);
int
elementSize
=
(
cl
.
getUseDoublePrecision
()
?
sizeof
(
mm_double4
)
:
sizeof
(
mm_float4
));
int
elementSize
=
(
cl
.
getUseDoublePrecision
()
?
sizeof
(
mm_double4
)
:
sizeof
(
mm_float4
));
pmeUpdateBsplinesKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
cl
.
getPosq
().
getDeviceBuffer
());
pmeUpdateBsplinesKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
cl
.
getPosq
().
getDeviceBuffer
());
...
@@ -1826,10 +1827,14 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
...
@@ -1826,10 +1827,14 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
pmeGrid
->
getDeviceBuffer
());
pmeSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
pmeGrid
->
getDeviceBuffer
());
pmeSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
pmeBsplineTheta
->
getDeviceBuffer
());
pmeSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
pmeBsplineTheta
->
getDeviceBuffer
());
pmeConvolutionKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
pmeGrid2
->
getDeviceBuffer
());
pmeConvolutionKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
pmeGrid2
->
getDeviceBuffer
());
pmeConvolutionKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
cl
.
getEnergyBuffer
().
getDeviceBuffer
());
pmeConvolutionKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
pmeBsplineModuliX
->
getDeviceBuffer
());
pmeConvolutionKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
pmeBsplineModuliX
->
getDeviceBuffer
());
pmeConvolutionKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
pmeBsplineModuliY
->
getDeviceBuffer
());
pmeConvolutionKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
pmeBsplineModuliY
->
getDeviceBuffer
());
pmeConvolutionKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
pmeBsplineModuliZ
->
getDeviceBuffer
());
pmeConvolutionKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
pmeBsplineModuliZ
->
getDeviceBuffer
());
pmeEvalEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
pmeGrid2
->
getDeviceBuffer
());
pmeEvalEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
cl
.
getEnergyBuffer
().
getDeviceBuffer
());
pmeEvalEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
pmeBsplineModuliX
->
getDeviceBuffer
());
pmeEvalEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
pmeBsplineModuliY
->
getDeviceBuffer
());
pmeEvalEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
pmeBsplineModuliZ
->
getDeviceBuffer
());
pmeInterpolateForceKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
cl
.
getPosq
().
getDeviceBuffer
());
pmeInterpolateForceKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
cl
.
getPosq
().
getDeviceBuffer
());
pmeInterpolateForceKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
cl
.
getForceBuffers
().
getDeviceBuffer
());
pmeInterpolateForceKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
cl
.
getForceBuffers
().
getDeviceBuffer
());
pmeInterpolateForceKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
pmeGrid
->
getDeviceBuffer
());
pmeInterpolateForceKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
pmeGrid
->
getDeviceBuffer
());
...
@@ -1861,7 +1866,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
...
@@ -1861,7 +1866,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
cl
.
executeKernel
(
ewaldForcesKernel
,
cl
.
getNumAtoms
());
cl
.
executeKernel
(
ewaldForcesKernel
,
cl
.
getNumAtoms
());
}
}
if
(
pmeGrid
!=
NULL
&&
includeReciprocal
)
{
if
(
pmeGrid
!=
NULL
&&
includeReciprocal
)
{
if
(
usePmeQueue
)
if
(
usePmeQueue
&&
!
includeEnergy
)
cl
.
setQueue
(
pmeQueue
);
cl
.
setQueue
(
pmeQueue
);
// Invert the periodic box vectors.
// Invert the periodic box vectors.
...
@@ -1936,19 +1941,24 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
...
@@ -1936,19 +1941,24 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
}
}
fft
->
execFFT
(
*
pmeGrid
,
*
pmeGrid2
,
true
);
fft
->
execFFT
(
*
pmeGrid
,
*
pmeGrid2
,
true
);
mm_double4
boxSize
=
cl
.
getPeriodicBoxSizeDouble
();
mm_double4
boxSize
=
cl
.
getPeriodicBoxSizeDouble
();
double
scaleFactor
=
1.0
/
(
M_PI
*
boxSize
.
x
*
boxSize
.
y
*
boxSize
.
z
);
if
(
cl
.
getUseDoublePrecision
())
{
if
(
cl
.
getUseDoublePrecision
())
{
pmeConvolutionKernel
.
setArg
<
mm_double4
>
(
5
,
recipBoxVectors
[
0
]);
pmeConvolutionKernel
.
setArg
<
mm_double4
>
(
4
,
recipBoxVectors
[
0
]);
pmeConvolutionKernel
.
setArg
<
mm_double4
>
(
6
,
recipBoxVectors
[
1
]);
pmeConvolutionKernel
.
setArg
<
mm_double4
>
(
5
,
recipBoxVectors
[
1
]);
pmeConvolutionKernel
.
setArg
<
mm_double4
>
(
7
,
recipBoxVectors
[
2
]);
pmeConvolutionKernel
.
setArg
<
mm_double4
>
(
6
,
recipBoxVectors
[
2
]);
pmeConvolutionKernel
.
setArg
<
cl_double
>
(
8
,
scaleFactor
);
pmeEvalEnergyKernel
.
setArg
<
mm_double4
>
(
5
,
recipBoxVectors
[
0
]);
pmeEvalEnergyKernel
.
setArg
<
mm_double4
>
(
6
,
recipBoxVectors
[
1
]);
pmeEvalEnergyKernel
.
setArg
<
mm_double4
>
(
7
,
recipBoxVectors
[
2
]);
}
}
else
{
else
{
pmeConvolutionKernel
.
setArg
<
mm_float4
>
(
5
,
recipBoxVectorsFloat
[
0
]);
pmeConvolutionKernel
.
setArg
<
mm_float4
>
(
4
,
recipBoxVectorsFloat
[
0
]);
pmeConvolutionKernel
.
setArg
<
mm_float4
>
(
6
,
recipBoxVectorsFloat
[
1
]);
pmeConvolutionKernel
.
setArg
<
mm_float4
>
(
5
,
recipBoxVectorsFloat
[
1
]);
pmeConvolutionKernel
.
setArg
<
mm_float4
>
(
7
,
recipBoxVectorsFloat
[
2
]);
pmeConvolutionKernel
.
setArg
<
mm_float4
>
(
6
,
recipBoxVectorsFloat
[
2
]);
pmeConvolutionKernel
.
setArg
<
cl_float
>
(
8
,
(
float
)
scaleFactor
);
pmeEvalEnergyKernel
.
setArg
<
mm_float4
>
(
5
,
recipBoxVectorsFloat
[
0
]);
}
pmeEvalEnergyKernel
.
setArg
<
mm_float4
>
(
6
,
recipBoxVectorsFloat
[
1
]);
pmeEvalEnergyKernel
.
setArg
<
mm_float4
>
(
7
,
recipBoxVectorsFloat
[
2
]);
}
if
(
includeEnergy
)
cl
.
executeKernel
(
pmeEvalEnergyKernel
,
cl
.
getNumAtoms
());
cl
.
executeKernel
(
pmeConvolutionKernel
,
cl
.
getNumAtoms
());
cl
.
executeKernel
(
pmeConvolutionKernel
,
cl
.
getNumAtoms
());
fft
->
execFFT
(
*
pmeGrid2
,
*
pmeGrid
,
false
);
fft
->
execFFT
(
*
pmeGrid2
,
*
pmeGrid
,
false
);
setPeriodicBoxSizeArg
(
cl
,
pmeInterpolateForceKernel
,
3
);
setPeriodicBoxSizeArg
(
cl
,
pmeInterpolateForceKernel
,
3
);
...
...
platforms/opencl/src/kernels/fft.cl
View file @
ad62d81e
...
@@ -2,6 +2,19 @@ real2 multiplyComplex(real2 c1, real2 c2) {
...
@@ -2,6 +2,19 @@ real2 multiplyComplex(real2 c1, real2 c2) {
return
(
real2
)
(
c1.x*c2.x-c1.y*c2.y,
c1.x*c2.y+c1.y*c2.x
)
;
return
(
real2
)
(
c1.x*c2.x-c1.y*c2.y,
c1.x*c2.y+c1.y*c2.x
)
;
}
}
/**
*
Load
a
value
from
the
half-complex
grid
produces
by
a
real-to-complex
transform.
*/
real2
loadComplexValue
(
__global
const
real2*
restrict
in,
int
x,
int
y,
int
z
)
{
const
int
inputZSize
=
ZSIZE/2+1
;
if
(
z
<
inputZSize
)
return
in[x*YSIZE*inputZSize+y*inputZSize+z]
;
int
xp
=
(
x
==
0
?
0
:
XSIZE-x
)
;
int
yp
=
(
y
==
0
?
0
:
YSIZE-y
)
;
real2
value
=
in[xp*YSIZE*inputZSize+yp*inputZSize+
(
ZSIZE-z
)
]
;
return
(
real2
)
(
value.x,
-value.y
)
;
}
/**
/**
*
Perform
a
1D
FFT
on
each
row
along
one
axis.
*
Perform
a
1D
FFT
on
each
row
along
one
axis.
*/
*/
...
@@ -16,10 +29,16 @@ __kernel void execFFT(__global const INPUT_TYPE* restrict in, __global OUTPUT_TY
...
@@ -16,10 +29,16 @@ __kernel void execFFT(__global const INPUT_TYPE* restrict in, __global OUTPUT_TY
int
index
=
baseIndex+get_local_id
(
0
)
/ZSIZE
;
int
index
=
baseIndex+get_local_id
(
0
)
/ZSIZE
;
int
x
=
index/YSIZE
;
int
x
=
index/YSIZE
;
int
y
=
index-x*YSIZE
;
int
y
=
index-x*YSIZE
;
#
if
OUTPUT_IS_PACKED
if
(
x
>=
XSIZE/2+1
)
continue
;
#
endif
#
if
LOOP_REQUIRED
#
if
LOOP_REQUIRED
for
(
int
z
=
get_local_id
(
0
)
; z < ZSIZE; z += get_local_size(0))
for
(
int
z
=
get_local_id
(
0
)
; z < ZSIZE; z += get_local_size(0))
#
if
INPUT_IS_REAL
#
if
INPUT_IS_REAL
data0[z]
=
(
real2
)
(
in[x*
(
YSIZE*ZSIZE
)
+y*ZSIZE+z],
0
)
;
data0[z]
=
(
real2
)
(
in[x*
(
YSIZE*ZSIZE
)
+y*ZSIZE+z],
0
)
;
#
elif
INPUT_IS_PACKED
data0[z]
=
loadComplexValue
(
in,
x,
y,
z
)
;
#
else
#
else
data0[z]
=
in[x*
(
YSIZE*ZSIZE
)
+y*ZSIZE+z]
;
data0[z]
=
in[x*
(
YSIZE*ZSIZE
)
+y*ZSIZE+z]
;
#
endif
#
endif
...
@@ -27,6 +46,8 @@ __kernel void execFFT(__global const INPUT_TYPE* restrict in, __global OUTPUT_TY
...
@@ -27,6 +46,8 @@ __kernel void execFFT(__global const INPUT_TYPE* restrict in, __global OUTPUT_TY
if
(
index
<
XSIZE*YSIZE
)
if
(
index
<
XSIZE*YSIZE
)
#
if
INPUT_IS_REAL
#
if
INPUT_IS_REAL
data0[get_local_id
(
0
)
]
=
(
real2
)
(
in[x*
(
YSIZE*ZSIZE
)
+y*ZSIZE+get_local_id
(
0
)
%ZSIZE],
0
)
;
data0[get_local_id
(
0
)
]
=
(
real2
)
(
in[x*
(
YSIZE*ZSIZE
)
+y*ZSIZE+get_local_id
(
0
)
%ZSIZE],
0
)
;
#
elif
INPUT_IS_PACKED
data0[get_local_id
(
0
)
]
=
loadComplexValue
(
in,
x,
y,
get_local_id
(
0
)
%ZSIZE
)
;
#
else
#
else
data0[get_local_id
(
0
)
]
=
in[x*
(
YSIZE*ZSIZE
)
+y*ZSIZE+get_local_id
(
0
)
%ZSIZE]
;
data0[get_local_id
(
0
)
]
=
in[x*
(
YSIZE*ZSIZE
)
+y*ZSIZE+get_local_id
(
0
)
%ZSIZE]
;
#
endif
#
endif
...
...
platforms/opencl/src/kernels/fftR2C.cl
View file @
ad62d81e
...
@@ -40,6 +40,7 @@ __kernel void unpackForwardData(__global const real2* restrict in, __global real
...
@@ -40,6 +40,7 @@ __kernel void unpackForwardData(__global const real2* restrict in, __global real
//
Transform
the
data.
//
Transform
the
data.
const
int
gridSize
=
PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE
;
const
int
gridSize
=
PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE
;
const
int
outputZSize
=
ZSIZE/2+1
;
for
(
int
index
=
get_global_id
(
0
)
; index < gridSize; index += get_global_size(0)) {
for
(
int
index
=
get_global_id
(
0
)
; index < gridSize; index += get_global_size(0)) {
int
x
=
index/
(
PACKED_YSIZE*PACKED_ZSIZE
)
;
int
x
=
index/
(
PACKED_YSIZE*PACKED_ZSIZE
)
;
int
remainder
=
index-x*
(
PACKED_YSIZE*PACKED_ZSIZE
)
;
int
remainder
=
index-x*
(
PACKED_YSIZE*PACKED_ZSIZE
)
;
...
@@ -58,25 +59,41 @@ __kernel void unpackForwardData(__global const real2* restrict in, __global real
...
@@ -58,25 +59,41 @@ __kernel void unpackForwardData(__global const real2* restrict in, __global real
real2
wfac
=
w[z]
;
real2
wfac
=
w[z]
;
#
endif
#
endif
real2
output
=
(
real2
)
((
z1.x+z2.x
-
wfac.x*
(
z1.x-z2.x
)
+
wfac.y*
(
z1.y+z2.y
))
/2,
(
z1.y-z2.y
-
wfac.y*
(
z1.x-z2.x
)
-
wfac.x*
(
z1.y+z2.y
))
/2
)
;
real2
output
=
(
real2
)
((
z1.x+z2.x
-
wfac.x*
(
z1.x-z2.x
)
+
wfac.y*
(
z1.y+z2.y
))
/2,
(
z1.y-z2.y
-
wfac.y*
(
z1.x-z2.x
)
-
wfac.x*
(
z1.y+z2.y
))
/2
)
;
out[x*YSIZE*ZSIZE+y*ZSIZE+z]
=
output
;
if
(
z
<
outputZSize
)
out[x*YSIZE*outputZSize+y*outputZSize+z]
=
output
;
xp
=
(
x
==
0
?
0
:
XSIZE-x
)
;
xp
=
(
x
==
0
?
0
:
XSIZE-x
)
;
yp
=
(
y
==
0
?
0
:
YSIZE-y
)
;
yp
=
(
y
==
0
?
0
:
YSIZE-y
)
;
zp
=
(
z
==
0
?
0
:
ZSIZE-z
)
;
zp
=
(
z
==
0
?
0
:
ZSIZE-z
)
;
if
(
zp
<
outputZSize
)
{
#
if
PACKED_AXIS
==
0
#
if
PACKED_AXIS
==
0
if
(
x
==
0
)
if
(
x
==
0
)
out[PACKED_XSIZE*YSIZE*
ZSIZE+yp*ZSIZE
+zp]
=
(
real2
)
((
z1.x-z1.y+z2.x-z2.y
)
/2,
(
-z1.x-z1.y+z2.x+z2.y
)
/2
)
;
out[PACKED_XSIZE*YSIZE*
outputZSize+yp*outputZSize
+zp]
=
(
real2
)
((
z1.x-z1.y+z2.x-z2.y
)
/2,
(
-z1.x-z1.y+z2.x+z2.y
)
/2
)
;
#
elif
PACKED_AXIS
==
1
#
elif
PACKED_AXIS
==
1
if
(
y
==
0
)
if
(
y
==
0
)
out[xp*YSIZE*
ZSIZE
+PACKED_YSIZE*
ZSIZE
+zp]
=
(
real2
)
((
z1.x-z1.y+z2.x-z2.y
)
/2,
(
-z1.x-z1.y+z2.x+z2.y
)
/2
)
;
out[xp*YSIZE*
outputZSize
+PACKED_YSIZE*
outputZSize
+zp]
=
(
real2
)
((
z1.x-z1.y+z2.x-z2.y
)
/2,
(
-z1.x-z1.y+z2.x+z2.y
)
/2
)
;
#
else
#
else
if
(
z
==
0
)
if
(
z
==
0
)
out[xp*YSIZE*
ZSIZE+yp*ZSIZE
+PACKED_ZSIZE]
=
(
real2
)
((
z1.x-z1.y+z2.x-z2.y
)
/2,
(
-z1.x-z1.y+z2.x+z2.y
)
/2
)
;
out[xp*YSIZE*
outputZSize+yp*outputZSize
+PACKED_ZSIZE]
=
(
real2
)
((
z1.x-z1.y+z2.x-z2.y
)
/2,
(
-z1.x-z1.y+z2.x+z2.y
)
/2
)
;
#
endif
#
endif
else
else
out[xp*YSIZE*ZSIZE+yp*ZSIZE+zp]
=
(
real2
)
(
output.x,
-output.y
)
;
out[xp*YSIZE*outputZSize+yp*outputZSize+zp]
=
(
real2
)
(
output.x,
-output.y
)
;
}
}
}
}
}
/**
*
Load
a
value
from
the
half-complex
grid
produced
by
a
real-to-complex
transform.
*/
real2
loadComplexValue
(
__global
const
real2*
restrict
in,
int
x,
int
y,
int
z
)
{
const
int
inputZSize
=
ZSIZE/2+1
;
if
(
z
<
inputZSize
)
return
in[x*YSIZE*inputZSize+y*inputZSize+z]
;
int
xp
=
(
x
==
0
?
0
:
XSIZE-x
)
;
int
yp
=
(
y
==
0
?
0
:
YSIZE-y
)
;
real2
value
=
in[xp*YSIZE*inputZSize+yp*inputZSize+
(
ZSIZE-z
)
]
;
return
(
real2
)
(
value.x,
-value.y
)
;
}
/**
/**
*
Repack
the
symmetric
complex
grid
into
one
half
as
large
in
preparation
for
doing
an
inverse
complex-to-real
transform.
*
Repack
the
symmetric
complex
grid
into
one
half
as
large
in
preparation
for
doing
an
inverse
complex-to-real
transform.
*/
*/
...
@@ -106,16 +123,16 @@ __kernel void packBackwardData(__global const real2* restrict in, __global real2
...
@@ -106,16 +123,16 @@ __kernel void packBackwardData(__global const real2* restrict in, __global real2
int
xp
=
(
x
==
0
?
0
:
PACKED_XSIZE-x
)
;
int
xp
=
(
x
==
0
?
0
:
PACKED_XSIZE-x
)
;
int
yp
=
(
y
==
0
?
0
:
PACKED_YSIZE-y
)
;
int
yp
=
(
y
==
0
?
0
:
PACKED_YSIZE-y
)
;
int
zp
=
(
z
==
0
?
0
:
PACKED_ZSIZE-z
)
;
int
zp
=
(
z
==
0
?
0
:
PACKED_ZSIZE-z
)
;
real2
z1
=
in[x*YSIZE*ZSIZE+y*ZSIZE+z]
;
real2
z1
=
loadComplexValue
(
in,
x,
y,
z
)
;
#
if
PACKED_AXIS
==
0
#
if
PACKED_AXIS
==
0
real2
wfac
=
w[x]
;
real2
wfac
=
w[x]
;
real2
z2
=
in[
(
PACKED_XSIZE-x
)
*YSIZE*ZSIZE+yp*
ZSIZE+
zp
]
;
real2
z2
=
loadComplexValue
(
in,
PACKED_XSIZE-x,
yp,
zp
)
;
#
elif
PACKED_AXIS
==
1
#
elif
PACKED_AXIS
==
1
real2
wfac
=
w[y]
;
real2
wfac
=
w[y]
;
real2
z2
=
in[xp*YSIZE*ZSIZE+
(
PACKED_YSIZE-y
)
*ZSIZE+
zp
]
;
real2
z2
=
loadComplexValue
(
in,
xp,
PACKED_YSIZE-y
,
zp
)
;
#
else
#
else
real2
wfac
=
w[z]
;
real2
wfac
=
w[z]
;
real2
z2
=
in[xp*YSIZE*ZSIZE+yp*ZSIZE+
(
PACKED_ZSIZE-z
)
]
;
real2
z2
=
loadComplexValue
(
in,
xp,
yp,
PACKED_ZSIZE-z
)
;
#
endif
#
endif
real2
even
=
(
real2
)
((
z1.x+z2.x
)
/2,
(
z1.y-z2.y
)
/2
)
;
real2
even
=
(
real2
)
((
z1.x+z2.x
)
/2,
(
z1.y-z2.y
)
/2
)
;
real2
odd
=
(
real2
)
((
z1.x-z2.x
)
/2,
(
z1.y+z2.y
)
/2
)
;
real2
odd
=
(
real2
)
((
z1.x-z2.x
)
/2,
(
z1.y+z2.y
)
/2
)
;
...
...
platforms/opencl/src/kernels/pme.cl
View file @
ad62d81e
...
@@ -294,17 +294,18 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
...
@@ -294,17 +294,18 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
}
}
#endif
#endif
__kernel
void
reciprocalConvolution
(
__global
real2*
restrict
pmeGrid,
__global
real*
restrict
energyBuffer,
__global
const
real*
restrict
pmeBsplineModuliX,
__kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global const real* restrict pmeBsplineModuliX,
__global
const
real*
restrict
pmeBsplineModuliY,
__global
const
real*
restrict
pmeBsplineModuliZ,
real4
recipBoxVecX,
real4
recipBoxVecY,
real4
recipBoxVecZ,
real
recipScaleFactor
)
{
__global const real* restrict pmeBsplineModuliY, __global const real* restrict pmeBsplineModuliZ, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
const
unsigned
int
gridSize
=
GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z
;
// R2C stores into a half complex matrix where the last dimension is cut by half
real
energy
=
0.0f
;
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
const real recipScaleFactor = (1.0f/M_PI)*recipBoxVecX.x*recipBoxVecY.y*recipBoxVecZ.z;
for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
int
kx
=
index/
(
GRID_SIZE_Y*GRID_SIZE_Z
)
;
// real indices
int
remainder
=
index-kx*GRID_SIZE_Y*GRID_SIZE_Z
;
int kx = index/(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
int
ky
=
remainder/GRID_SIZE_Z
;
int remainder = index-kx*GRID_SIZE_Y*(GRID_SIZE_Z/2+1);
int
kz
=
remainder-ky*GRID_SIZE_Z
;
int ky = remainder/(GRID_SIZE_Z/2+1);
if
(
kx
==
0
&&
ky
==
0
&&
kz
==
0
)
int kz = remainder-ky*(GRID_SIZE_Z/2+1);
continue
;
int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
...
@@ -318,8 +319,48 @@ __kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global r
...
@@ -318,8 +319,48 @@ __kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global r
real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
real denom = m2*bx*by*bz;
real denom = m2*bx*by*bz;
real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
pmeGrid[index]
=
(
real2
)
(
grid.x*eterm,
grid.y*eterm
)
;
if (kx != 0 || ky != 0 || kz != 0) {
energy
+=
eterm*
(
grid.x*grid.x
+
grid.y*grid.y
)
;
pmeGrid[index] = (real2) (grid.x*eterm, grid.y*eterm);
}
}
}
__kernel void gridEvaluateEnergy(__global real2* restrict pmeGrid, __global real* restrict energyBuffer,
__global const real* restrict pmeBsplineModuliX, __global const real* restrict pmeBsplineModuliY, __global const real* restrict pmeBsplineModuliZ,
real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
// R2C stores into a half complex matrix where the last dimension is cut by half
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
const real recipScaleFactor = (1.0f/M_PI)*recipBoxVecX.x*recipBoxVecY.y*recipBoxVecZ.z;
real energy = 0;
for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
// real indices
int kx = index/(GRID_SIZE_Y*(GRID_SIZE_Z));
int remainder = index-kx*GRID_SIZE_Y*(GRID_SIZE_Z);
int ky = remainder/(GRID_SIZE_Z);
int kz = remainder-ky*(GRID_SIZE_Z);
int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
real mhx = mx*recipBoxVecX.x;
real mhy = mx*recipBoxVecY.x+my*recipBoxVecY.y;
real mhz = mx*recipBoxVecZ.x+my*recipBoxVecZ.y+mz*recipBoxVecZ.z;
real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
real bx = pmeBsplineModuliX[kx];
real by = pmeBsplineModuliY[ky];
real bz = pmeBsplineModuliZ[kz];
real denom = m2*bx*by*bz;
real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
if (kz >= (GRID_SIZE_Z/2+1)) {
kx = ((kx == 0) ? kx : GRID_SIZE_X-kx);
ky = ((ky == 0) ? ky : GRID_SIZE_Y-ky);
kz = GRID_SIZE_Z-kz;
}
int indexInHalfComplexGrid = kz + ky*(GRID_SIZE_Z/2+1)+kx*(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
real2 grid = pmeGrid[indexInHalfComplexGrid];
if (kx != 0 || ky != 0 |
|
kz
!=
0
)
{
energy
+=
eterm*
(
grid.x*grid.x
+
grid.y*grid.y
)
;
}
}
}
energyBuffer[get_global_id
(
0
)
]
+=
0.5f*energy
;
energyBuffer[get_global_id
(
0
)
]
+=
0.5f*energy
;
}
}
...
...
platforms/opencl/tests/TestOpenCLFFT.cpp
View file @
ad62d81e
...
@@ -85,10 +85,15 @@ void testTransform(bool realToComplex, int xsize, int ysize, int zsize) {
...
@@ -85,10 +85,15 @@ void testTransform(bool realToComplex, int xsize, int ysize, int zsize) {
fftpack_t
plan
;
fftpack_t
plan
;
fftpack_init_3d
(
&
plan
,
xsize
,
ysize
,
zsize
);
fftpack_init_3d
(
&
plan
,
xsize
,
ysize
,
zsize
);
fftpack_exec_3d
(
plan
,
FFTPACK_FORWARD
,
&
reference
[
0
],
&
reference
[
0
]);
fftpack_exec_3d
(
plan
,
FFTPACK_FORWARD
,
&
reference
[
0
],
&
reference
[
0
]);
for
(
int
i
=
0
;
i
<
(
int
)
result
.
size
();
++
i
)
{
int
outputZSize
=
(
realToComplex
?
zsize
/
2
+
1
:
zsize
);
ASSERT_EQUAL_TOL
(
reference
[
i
].
re
,
result
[
i
].
x
,
1e-3
);
for
(
int
x
=
0
;
x
<
xsize
;
x
++
)
ASSERT_EQUAL_TOL
(
reference
[
i
].
im
,
result
[
i
].
y
,
1e-3
);
for
(
int
y
=
0
;
y
<
ysize
;
y
++
)
}
for
(
int
z
=
0
;
z
<
outputZSize
;
z
++
)
{
int
index1
=
x
*
ysize
*
zsize
+
y
*
zsize
+
z
;
int
index2
=
x
*
ysize
*
outputZSize
+
y
*
outputZSize
+
z
;
ASSERT_EQUAL_TOL
(
reference
[
index1
].
re
,
result
[
index2
].
x
,
1e-3
);
ASSERT_EQUAL_TOL
(
reference
[
index1
].
im
,
result
[
index2
].
y
,
1e-3
);
}
fftpack_destroy
(
plan
);
fftpack_destroy
(
plan
);
// Perform a backward transform and see if we get the original values.
// Perform a backward transform and see if we get the original values.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment