Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
ec3ee586
Commit
ec3ee586
authored
Jul 05, 2011
by
Peter Eastman
Browse files
Implemented faster version of charge spreading on GPUs that support 64 bit atomics
parent
548efdfb
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
76 additions
and
2 deletions
+76
-2
platforms/opencl/src/OpenCLContext.cpp
platforms/opencl/src/OpenCLContext.cpp
+15
-0
platforms/opencl/src/OpenCLContext.h
platforms/opencl/src/OpenCLContext.h
+7
-0
platforms/opencl/src/OpenCLKernels.cpp
platforms/opencl/src/OpenCLKernels.cpp
+13
-1
platforms/opencl/src/OpenCLKernels.h
platforms/opencl/src/OpenCLKernels.h
+1
-0
platforms/opencl/src/kernels/pme.cl
platforms/opencl/src/kernels/pme.cl
+40
-1
No files found.
platforms/opencl/src/OpenCLContext.cpp
View file @
ec3ee586
...
@@ -48,6 +48,9 @@ using namespace std;
...
@@ -48,6 +48,9 @@ using namespace std;
#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
#endif
#endif
#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV
#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
#endif
const
int
OpenCLContext
::
ThreadBlockSize
=
64
;
const
int
OpenCLContext
::
ThreadBlockSize
=
64
;
const
int
OpenCLContext
::
TileSize
=
32
;
const
int
OpenCLContext
::
TileSize
=
32
;
...
@@ -90,10 +93,20 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex, OpenCLPlatform::
...
@@ -90,10 +93,20 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex, OpenCLPlatform::
throw
OpenMMException
(
"The specified OpenCL device is not compatible with OpenMM"
);
throw
OpenMMException
(
"The specified OpenCL device is not compatible with OpenMM"
);
compilationOptions
=
"-DWORK_GROUP_SIZE="
+
OpenCLExpressionUtilities
::
intToString
(
ThreadBlockSize
);
compilationOptions
=
"-DWORK_GROUP_SIZE="
+
OpenCLExpressionUtilities
::
intToString
(
ThreadBlockSize
);
defaultOptimizationOptions
=
"-cl-fast-relaxed-math"
;
defaultOptimizationOptions
=
"-cl-fast-relaxed-math"
;
supports64BitGlobalAtomics
=
(
device
.
getInfo
<
CL_DEVICE_EXTENSIONS
>
().
find
(
"cl_khr_int64_base_atomics"
)
!=
string
::
npos
);
string
vendor
=
device
.
getInfo
<
CL_DEVICE_VENDOR
>
();
string
vendor
=
device
.
getInfo
<
CL_DEVICE_VENDOR
>
();
if
(
vendor
.
size
()
>=
6
&&
vendor
.
substr
(
0
,
6
)
==
"NVIDIA"
)
{
if
(
vendor
.
size
()
>=
6
&&
vendor
.
substr
(
0
,
6
)
==
"NVIDIA"
)
{
compilationOptions
+=
" -DWARPS_ARE_ATOMIC"
;
compilationOptions
+=
" -DWARPS_ARE_ATOMIC"
;
simdWidth
=
32
;
simdWidth
=
32
;
// Compute level 1.2 and later Nvidia GPUs support 64 bit atomics, even though they don't list the
// proper extension as supported.
cl_uint
computeCapabilityMajor
,
computeCapabilityMinor
;
clGetDeviceInfo
(
device
(),
CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
,
sizeof
(
cl_uint
),
&
computeCapabilityMajor
,
NULL
);
clGetDeviceInfo
(
device
(),
CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV
,
sizeof
(
cl_uint
),
&
computeCapabilityMinor
,
NULL
);
if
(
computeCapabilityMajor
>
1
||
computeCapabilityMinor
>
1
)
supports64BitGlobalAtomics
=
true
;
}
}
else
if
(
vendor
.
size
()
>=
28
&&
vendor
.
substr
(
0
,
28
)
==
"Advanced Micro Devices, Inc."
)
{
else
if
(
vendor
.
size
()
>=
28
&&
vendor
.
substr
(
0
,
28
)
==
"Advanced Micro Devices, Inc."
)
{
// AMD APP SDK 2.4 has a performance problem with atomics. Enable the work around.
// AMD APP SDK 2.4 has a performance problem with atomics. Enable the work around.
...
@@ -104,6 +117,8 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex, OpenCLPlatform::
...
@@ -104,6 +117,8 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex, OpenCLPlatform::
}
}
else
else
simdWidth
=
1
;
simdWidth
=
1
;
if
(
supports64BitGlobalAtomics
)
compilationOptions
+=
" -DSUPPORTS_64_BIT_ATOMICS"
;
queue
=
cl
::
CommandQueue
(
context
,
device
);
queue
=
cl
::
CommandQueue
(
context
,
device
);
numAtoms
=
numParticles
;
numAtoms
=
numParticles
;
paddedNumAtoms
=
TileSize
*
((
numParticles
+
TileSize
-
1
)
/
TileSize
);
paddedNumAtoms
=
TileSize
*
((
numParticles
+
TileSize
-
1
)
/
TileSize
);
...
...
platforms/opencl/src/OpenCLContext.h
View file @
ec3ee586
...
@@ -399,6 +399,12 @@ public:
...
@@ -399,6 +399,12 @@ public:
int
getSIMDWidth
()
const
{
int
getSIMDWidth
()
const
{
return
simdWidth
;
return
simdWidth
;
}
}
/**
* Get whether the device being used supports 64 bit atomic operations on global memory.
*/
bool
getSupports64BitGlobalAtomics
()
{
return
supports64BitGlobalAtomics
;
}
/**
/**
* Get the size of the periodic box.
* Get the size of the periodic box.
*/
*/
...
@@ -458,6 +464,7 @@ private:
...
@@ -458,6 +464,7 @@ private:
int
numThreadBlocks
;
int
numThreadBlocks
;
int
numForceBuffers
;
int
numForceBuffers
;
int
simdWidth
;
int
simdWidth
;
bool
supports64BitGlobalAtomics
;
mm_float4
periodicBoxSize
;
mm_float4
periodicBoxSize
;
mm_float4
invPeriodicBoxSize
;
mm_float4
invPeriodicBoxSize
;
std
::
string
compilationOptions
,
defaultOptimizationOptions
;
std
::
string
compilationOptions
,
defaultOptimizationOptions
;
...
...
platforms/opencl/src/OpenCLKernels.cpp
View file @
ec3ee586
...
@@ -1445,6 +1445,10 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
...
@@ -1445,6 +1445,10 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeInterpolateForceKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
pmeBsplineTheta
->
getDeviceBuffer
());
pmeInterpolateForceKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
pmeBsplineTheta
->
getDeviceBuffer
());
pmeInterpolateForceKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
pmeBsplineDtheta
->
getDeviceBuffer
());
pmeInterpolateForceKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
pmeBsplineDtheta
->
getDeviceBuffer
());
pmeInterpolateForceKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
pmeGrid
->
getDeviceBuffer
());
pmeInterpolateForceKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
pmeGrid
->
getDeviceBuffer
());
if
(
cl
.
getSupports64BitGlobalAtomics
())
{
pmeFinishSpreadChargeKernel
=
cl
::
Kernel
(
program
,
"finishSpreadCharge"
);
pmeFinishSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
pmeGrid
->
getDeviceBuffer
());
}
}
}
}
}
if
(
exceptionIndices
!=
NULL
)
if
(
exceptionIndices
!=
NULL
)
...
@@ -1476,7 +1480,15 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
...
@@ -1476,7 +1480,15 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeAtomRangeKernel
.
setArg
<
mm_float4
>
(
3
,
boxSize
);
pmeAtomRangeKernel
.
setArg
<
mm_float4
>
(
3
,
boxSize
);
pmeAtomRangeKernel
.
setArg
<
mm_float4
>
(
4
,
invBoxSize
);
pmeAtomRangeKernel
.
setArg
<
mm_float4
>
(
4
,
invBoxSize
);
cl
.
executeKernel
(
pmeAtomRangeKernel
,
cl
.
getNumAtoms
());
cl
.
executeKernel
(
pmeAtomRangeKernel
,
cl
.
getNumAtoms
());
cl
.
executeKernel
(
pmeSpreadChargeKernel
,
cl
.
getNumAtoms
());
if
(
cl
.
getSupports64BitGlobalAtomics
())
{
cl
.
clearBuffer
(
pmeGrid
->
getDeviceBuffer
(),
pmeGrid
->
getSize
()
*
2
);
pmeSpreadChargeKernel
.
setArg
<
mm_float4
>
(
5
,
boxSize
);
pmeSpreadChargeKernel
.
setArg
<
mm_float4
>
(
6
,
invBoxSize
);
cl
.
executeKernel
(
pmeSpreadChargeKernel
,
cl
.
getNumAtoms
(),
PmeOrder
*
PmeOrder
*
PmeOrder
);
cl
.
executeKernel
(
pmeFinishSpreadChargeKernel
,
pmeGrid
->
getSize
());
}
else
cl
.
executeKernel
(
pmeSpreadChargeKernel
,
cl
.
getNumAtoms
());
}
}
fft
->
execFFT
(
*
pmeGrid
,
true
);
fft
->
execFFT
(
*
pmeGrid
,
true
);
pmeConvolutionKernel
.
setArg
<
mm_float4
>
(
5
,
invBoxSize
);
pmeConvolutionKernel
.
setArg
<
mm_float4
>
(
5
,
invBoxSize
);
...
...
platforms/opencl/src/OpenCLKernels.h
View file @
ec3ee586
...
@@ -521,6 +521,7 @@ private:
...
@@ -521,6 +521,7 @@ private:
cl
::
Kernel
pmeAtomRangeKernel
;
cl
::
Kernel
pmeAtomRangeKernel
;
cl
::
Kernel
pmeUpdateBsplinesKernel
;
cl
::
Kernel
pmeUpdateBsplinesKernel
;
cl
::
Kernel
pmeSpreadChargeKernel
;
cl
::
Kernel
pmeSpreadChargeKernel
;
cl
::
Kernel
pmeFinishSpreadChargeKernel
;
cl
::
Kernel
pmeConvolutionKernel
;
cl
::
Kernel
pmeConvolutionKernel
;
cl
::
Kernel
pmeInterpolateForceKernel
;
cl
::
Kernel
pmeInterpolateForceKernel
;
std
::
map
<
std
::
string
,
std
::
string
>
pmeDefines
;
std
::
map
<
std
::
string
,
std
::
string
>
pmeDefines
;
...
...
platforms/opencl/src/kernels/pme.cl
View file @
ec3ee586
...
@@ -78,6 +78,44 @@ __kernel void findAtomRangeForGrid(__global int2* pmeAtomGridIndex, __global int
...
@@ -78,6 +78,44 @@ __kernel void findAtomRangeForGrid(__global int2* pmeAtomGridIndex, __global int
}
}
}
}
#
ifdef
SUPPORTS_64_BIT_ATOMICS
#
pragma
OPENCL
EXTENSION
cl_khr_int64_base_atomics
:
enable
__kernel
void
gridSpreadCharge
(
__global
float4*
posq,
__global
int2*
pmeAtomGridIndex,
__global
int*
pmeAtomRange,
__global
long*
pmeGrid,
__global
float4*
pmeBsplineTheta,
float4
periodicBoxSize,
float4
invPeriodicBoxSize
)
{
int
ix
=
get_local_id
(
0
)
/
(
PME_ORDER*PME_ORDER
)
;
int
remainder
=
get_local_id
(
0
)
-ix*PME_ORDER*PME_ORDER
;
int
iy
=
remainder/PME_ORDER
;
int
iz
=
remainder-iy*PME_ORDER
;
if
(
ix
<
PME_ORDER
)
{
for
(
int
atomIndex
=
get_group_id
(
0
)
; atomIndex < NUM_ATOMS; atomIndex += get_num_groups(0)) {
float4
pos
=
posq[atomIndex]
;
float
atomCharge
=
pos.w
;
float
add
=
atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z
;
pos.x
-=
floor
(
pos.x*invPeriodicBoxSize.x
)
*periodicBoxSize.x
;
pos.y
-=
floor
(
pos.y*invPeriodicBoxSize.y
)
*periodicBoxSize.y
;
pos.z
-=
floor
(
pos.z*invPeriodicBoxSize.z
)
*periodicBoxSize.z
;
int
x
=
(
int
)
((
pos.x*invPeriodicBoxSize.x
)
*GRID_SIZE_X
)
+ix
;
int
y
=
(
int
)
((
pos.y*invPeriodicBoxSize.y
)
*GRID_SIZE_Y
)
+iy
;
int
z
=
(
int
)
((
pos.z*invPeriodicBoxSize.z
)
*GRID_SIZE_Z
)
+iz
;
x
-=
(
x
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
)
;
y
-=
(
y
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
)
;
z
-=
(
z
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
)
;
atom_add
(
&pmeGrid[x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z],
(
long
)
(
add*0xFFFFFFFF
))
;
}
}
}
__kernel
void
finishSpreadCharge
(
__global
long*
pmeGrid
)
{
__global
float2*
floatGrid
=
(
__global
float2*
)
pmeGrid
;
const
unsigned
int
gridSize
=
GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z
;
float
scale
=
EPSILON_FACTOR/
(
float
)
0xFFFFFFFF
;
for
(
int
index
=
get_global_id
(
0
)
; index < gridSize; index += get_global_size(0)) {
long
value
=
pmeGrid[index]
;
float2
floatValue
=
(
float2
)
((
float
)
(
value*scale
)
,
0.0f
)
;
floatGrid[index]
=
floatValue
;
}
}
#
else
__kernel
void
gridSpreadCharge
(
__global
float4*
posq,
__global
int2*
pmeAtomGridIndex,
__global
int*
pmeAtomRange,
__global
float2*
pmeGrid,
__global
float4*
pmeBsplineTheta
)
{
__kernel
void
gridSpreadCharge
(
__global
float4*
posq,
__global
int2*
pmeAtomGridIndex,
__global
int*
pmeAtomRange,
__global
float2*
pmeGrid,
__global
float4*
pmeBsplineTheta
)
{
unsigned
int
numGridPoints
=
GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z
;
unsigned
int
numGridPoints
=
GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z
;
for
(
int
gridIndex
=
get_global_id
(
0
)
; gridIndex < numGridPoints; gridIndex += get_global_size(0)) {
for
(
int
gridIndex
=
get_global_id
(
0
)
; gridIndex < numGridPoints; gridIndex += get_global_size(0)) {
...
@@ -91,7 +129,7 @@ __kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGrid
...
@@ -91,7 +129,7 @@ __kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGrid
float
result
=
0.0f
;
float
result
=
0.0f
;
//
Loop
over
all
atoms
that
affect
this
grid
point.
//
Loop
over
all
atoms
that
affect
this
grid
point.
for
(
int
ix
=
0
; ix < PME_ORDER; ++ix) {
for
(
int
ix
=
0
; ix < PME_ORDER; ++ix) {
int
x
=
gridPoint.x-ix+
(
gridPoint.x
>=
ix
?
0
:
GRID_SIZE_X
)
;
int
x
=
gridPoint.x-ix+
(
gridPoint.x
>=
ix
?
0
:
GRID_SIZE_X
)
;
for
(
int
iy
=
0
; iy < PME_ORDER; ++iy) {
for
(
int
iy
=
0
; iy < PME_ORDER; ++iy) {
...
@@ -133,6 +171,7 @@ __kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGrid
...
@@ -133,6 +171,7 @@ __kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGrid
pmeGrid[gridIndex]
=
(
float2
)
(
result*EPSILON_FACTOR,
0.0f
)
;
pmeGrid[gridIndex]
=
(
float2
)
(
result*EPSILON_FACTOR,
0.0f
)
;
}
}
}
}
#
endif
__kernel
void
reciprocalConvolution
(
__global
float2*
pmeGrid,
__global
float*
energyBuffer,
__global
float*
pmeBsplineModuliX,
__kernel
void
reciprocalConvolution
(
__global
float2*
pmeGrid,
__global
float*
energyBuffer,
__global
float*
pmeBsplineModuliX,
__global
float*
pmeBsplineModuliY,
__global
float*
pmeBsplineModuliZ,
float4
invPeriodicBoxSize,
float
recipScaleFactor
)
{
__global
float*
pmeBsplineModuliY,
__global
float*
pmeBsplineModuliZ,
float4
invPeriodicBoxSize,
float
recipScaleFactor
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment