Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
ce74c6ae
Commit
ce74c6ae
authored
Oct 25, 2011
by
Peter Eastman
Browse files
More changes from Tony to reduce local memory use
parent
3f64d970
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
64 additions
and
45 deletions
+64
-45
platforms/opencl/src/OpenCLKernels.cpp
platforms/opencl/src/OpenCLKernels.cpp
+10
-13
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+1
-1
platforms/opencl/src/kernels/gbsaObc_cpu.cl
platforms/opencl/src/kernels/gbsaObc_cpu.cl
+2
-2
platforms/opencl/src/kernels/gbsaObc_default.cl
platforms/opencl/src/kernels/gbsaObc_default.cl
+40
-22
platforms/opencl/src/kernels/gbsaObc_nvidia.cl
platforms/opencl/src/kernels/gbsaObc_nvidia.cl
+5
-2
platforms/opencl/src/kernels/nonbonded_default.cl
platforms/opencl/src/kernels/nonbonded_default.cl
+4
-3
platforms/opencl/src/kernels/nonbonded_nvidia.cl
platforms/opencl/src/kernels/nonbonded_nvidia.cl
+2
-2
No files found.
platforms/opencl/src/OpenCLKernels.cpp
View file @
ce74c6ae
...
@@ -1517,8 +1517,7 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
...
@@ -1517,8 +1517,7 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
defines
[
"NUM_ATOMS"
]
=
intToString
(
cl
.
getNumAtoms
());
defines
[
"NUM_ATOMS"
]
=
intToString
(
cl
.
getNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
intToString
(
cl
.
getPaddedNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
intToString
(
cl
.
getPaddedNumAtoms
());
defines
[
"NUM_BLOCKS"
]
=
OpenCLExpressionUtilities
::
intToString
(
cl
.
getNumAtomBlocks
());
defines
[
"NUM_BLOCKS"
]
=
OpenCLExpressionUtilities
::
intToString
(
cl
.
getNumAtomBlocks
());
if
(
cl
.
getSIMDWidth
()
==
32
)
defines
[
"FORCE_WORK_GROUP_SIZE"
]
=
OpenCLExpressionUtilities
::
intToString
(
nb
.
getForceThreadBlockSize
());
defines
[
"WARPS_PER_GROUP"
]
=
OpenCLExpressionUtilities
::
intToString
(
cl
.
getNonbondedUtilities
().
getForceThreadBlockSize
()
/
OpenCLContext
::
TileSize
);
string
file
;
string
file
;
if
(
deviceIsCpu
)
if
(
deviceIsCpu
)
file
=
OpenCLKernelSources
::
gbsaObc_cpu
;
file
=
OpenCLKernelSources
::
gbsaObc_cpu
;
...
@@ -1534,7 +1533,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
...
@@ -1534,7 +1533,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getPosq
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getPosq
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
params
->
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
params
->
getDeviceBuffer
());
computeBornSumKernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
:
nb
.
getForceThreadBlockSize
())
*
7
*
sizeof
(
cl_float
),
NULL
);
computeBornSumKernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
:
nb
.
getForceThreadBlockSize
())
*
7
*
sizeof
(
cl_float
),
NULL
);
computeBornSumKernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
1
:
nb
.
getForceThreadBlockSize
())
*
sizeof
(
cl_float
),
NULL
);
if
(
nb
.
getUseCutoff
())
{
if
(
nb
.
getUseCutoff
())
{
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
...
@@ -1557,7 +1555,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
...
@@ -1557,7 +1555,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getPosq
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getPosq
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
bornRadii
->
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
bornRadii
->
getDeviceBuffer
());
force1Kernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
:
nb
.
getForceThreadBlockSize
())
*
9
*
sizeof
(
cl_float
),
NULL
);
force1Kernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
:
nb
.
getForceThreadBlockSize
())
*
9
*
sizeof
(
cl_float
),
NULL
);
force1Kernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
1
:
nb
.
getForceThreadBlockSize
())
*
sizeof
(
mm_float4
),
NULL
);
if
(
nb
.
getUseCutoff
())
{
if
(
nb
.
getUseCutoff
())
{
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
...
@@ -1596,19 +1593,19 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
...
@@ -1596,19 +1593,19 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
reduceBornForceKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
obcChain
->
getDeviceBuffer
());
reduceBornForceKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
obcChain
->
getDeviceBuffer
());
}
}
if
(
nb
.
getUseCutoff
())
{
if
(
nb
.
getUseCutoff
())
{
computeBornSumKernel
.
setArg
<
mm_float4
>
(
7
,
cl
.
getPeriodicBoxSize
());
computeBornSumKernel
.
setArg
<
mm_float4
>
(
6
,
cl
.
getPeriodicBoxSize
());
computeBornSumKernel
.
setArg
<
mm_float4
>
(
8
,
cl
.
getInvPeriodicBoxSize
());
computeBornSumKernel
.
setArg
<
mm_float4
>
(
7
,
cl
.
getInvPeriodicBoxSize
());
force1Kernel
.
setArg
<
mm_float4
>
(
9
,
cl
.
getPeriodicBoxSize
());
force1Kernel
.
setArg
<
mm_float4
>
(
8
,
cl
.
getPeriodicBoxSize
());
force1Kernel
.
setArg
<
mm_float4
>
(
10
,
cl
.
getInvPeriodicBoxSize
());
force1Kernel
.
setArg
<
mm_float4
>
(
9
,
cl
.
getInvPeriodicBoxSize
());
if
(
maxTiles
<
nb
.
getInteractingTiles
().
getSize
())
{
if
(
maxTiles
<
nb
.
getInteractingTiles
().
getSize
())
{
maxTiles
=
nb
.
getInteractingTiles
().
getSize
();
maxTiles
=
nb
.
getInteractingTiles
().
getSize
();
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
5
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
5
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl_uint
>
(
9
,
maxTiles
);
computeBornSumKernel
.
setArg
<
cl_uint
>
(
8
,
maxTiles
);
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
7
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
6
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl_uint
>
(
1
1
,
maxTiles
);
force1Kernel
.
setArg
<
cl_uint
>
(
1
0
,
maxTiles
);
if
(
cl
.
getSIMDWidth
()
==
32
||
deviceIsCpu
)
{
if
(
cl
.
getSIMDWidth
()
==
32
||
deviceIsCpu
)
{
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
10
,
nb
.
getInteractionFlags
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
9
,
nb
.
getInteractionFlags
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
1
2
,
nb
.
getInteractionFlags
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
1
1
,
nb
.
getInteractionFlags
().
getDeviceBuffer
());
}
}
}
}
}
}
...
...
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
View file @
ce74c6ae
...
@@ -480,7 +480,7 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
...
@@ -480,7 +480,7 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
defines
[
"USE_EXCLUSIONS"
]
=
"1"
;
defines
[
"USE_EXCLUSIONS"
]
=
"1"
;
if
(
isSymmetric
)
if
(
isSymmetric
)
defines
[
"USE_SYMMETRIC"
]
=
"1"
;
defines
[
"USE_SYMMETRIC"
]
=
"1"
;
defines
[
"
NONBONDED
_WORK_GROUP_SIZE"
]
=
OpenCLExpressionUtilities
::
intToString
(
forceThreadBlockSize
);
defines
[
"
FORCE
_WORK_GROUP_SIZE"
]
=
OpenCLExpressionUtilities
::
intToString
(
forceThreadBlockSize
);
defines
[
"CUTOFF_SQUARED"
]
=
OpenCLExpressionUtilities
::
doubleToString
(
cutoff
*
cutoff
);
defines
[
"CUTOFF_SQUARED"
]
=
OpenCLExpressionUtilities
::
doubleToString
(
cutoff
*
cutoff
);
defines
[
"NUM_ATOMS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getNumAtoms
());
defines
[
"NUM_ATOMS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getPaddedNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getPaddedNumAtoms
());
...
...
platforms/opencl/src/kernels/gbsaObc_cpu.cl
View file @
ce74c6ae
...
@@ -15,7 +15,7 @@ typedef struct {
...
@@ -15,7 +15,7 @@ typedef struct {
*/
*/
__kernel
void
computeBornSum
(
__global
float*
restrict
global_bornSum,
__global
const
float4*
restrict
posq,
__global
const
float2*
restrict
global_params,
__kernel
void
computeBornSum
(
__global
float*
restrict
global_bornSum,
__global
const
float4*
restrict
posq,
__global
const
float2*
restrict
global_params,
__local
AtomData*
restrict
localData,
__local
float*
restrict
tempBuffer,
__local
AtomData*
restrict
localData,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags
)
{
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags
)
{
#
else
#
else
...
@@ -192,7 +192,7 @@ __kernel void computeBornSum(__global float* restrict global_bornSum, __global c
...
@@ -192,7 +192,7 @@ __kernel void computeBornSum(__global float* restrict global_bornSum, __global c
__kernel void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* restrict energyBuffer,
__kernel void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* restrict energyBuffer,
__global const float4* restrict posq, __global const float* restrict global_bornRadii, __global float* restrict global_bornForce,
__global const float4* restrict posq, __global const float* restrict global_bornRadii, __global float* restrict global_bornForce,
__local AtomData* restrict localData,
__local float4* restrict tempBuffer,
__local AtomData* restrict localData,
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags) {
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags) {
#else
#else
...
...
platforms/opencl/src/kernels/gbsaObc_default.cl
View file @
ce74c6ae
...
@@ -11,9 +11,9 @@ typedef struct {
...
@@ -11,9 +11,9 @@ typedef struct {
*
Compute
the
Born
sum.
*
Compute
the
Born
sum.
*/
*/
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
__kernel
__attribute__
((
reqd_work_group_size
(
FORCE_
WORK_GROUP_SIZE,
1
,
1
)))
void
computeBornSum
(
__global
float*
restrict
global_bornSum,
__global
const
float4*
restrict
posq,
__global
const
float2*
restrict
global_params,
void
computeBornSum
(
__global
float*
restrict
global_bornSum,
__global
const
float4*
restrict
posq,
__global
const
float2*
restrict
global_params,
__local
AtomData1*
restrict
localData,
__local
float*
restrict
tempBuffer,
__local
AtomData1*
restrict
localData,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles
)
{
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles
)
{
#
else
#
else
...
@@ -28,6 +28,7 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
...
@@ -28,6 +28,7 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
unsigned
int
end
=
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
unsigned
int
end
=
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
#
endif
#
endif
unsigned
int
lasty
=
0xFFFFFFFF
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__local
float
tempBuffer[FORCE_WORK_GROUP_SIZE/2]
;
while
(
pos
<
end
)
{
while
(
pos
<
end
)
{
//
Extract
the
coordinates
of
this
tile
//
Extract
the
coordinates
of
this
tile
...
@@ -94,9 +95,8 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
...
@@ -94,9 +95,8 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
// Sum the forces and write results.
// Sum the forces and write results.
if (get_local_id(0) >= TILE_SIZE)
if (get_local_id(0) >= TILE_SIZE)
tempBuffer[
get_local_id(0)
] = bornSum;
tempBuffer[
tgx
] = bornSum;
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
...
@@ -104,7 +104,7 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
...
@@ -104,7 +104,7 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
#else
#else
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
#endif
global_bornSum[offset] += bornSum+tempBuffer[
get_local_id(0)+TILE_SIZE
];
global_bornSum[offset] += bornSum+tempBuffer[
tgx
];
}
}
}
}
else {
else {
...
@@ -174,7 +174,7 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
...
@@ -174,7 +174,7 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
// Sum the forces and write results.
// Sum the forces and write results.
if (get_local_id(0) >= TILE_SIZE)
if (get_local_id(0) >= TILE_SIZE)
tempBuffer[
get_local_id(0)
] = bornSum;
tempBuffer[
tgx
] = bornSum;
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
...
@@ -184,8 +184,13 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
...
@@ -184,8 +184,13 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
#endif
global_bornSum[offset1] += bornSum+tempBuffer[get_local_id(0)+TILE_SIZE];
// Do both loads before both stores to minimize store-load waits.
global_bornSum[offset2] += localData[get_local_id(0)].bornSum+localData[get_local_id(0)+TILE_SIZE].bornSum;
float sum1 = global_bornSum[offset1];
float sum2 = global_bornSum[offset2];
sum1 += bornSum + tempBuffer[tgx];
sum2 += localData[get_local_id(0)].bornSum + localData[get_local_id(0)+TILE_SIZE].bornSum;
global_bornSum[offset1] = sum1;
global_bornSum[offset2] = sum2;
}
}
}
}
lasty = y;
lasty = y;
...
@@ -204,10 +209,10 @@ typedef struct {
...
@@ -204,10 +209,10 @@ typedef struct {
* First part of computing the GBSA interaction.
* First part of computing the GBSA interaction.
*/
*/
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
__kernel __attribute__((reqd_work_group_size(
FORCE_
WORK_GROUP_SIZE, 1, 1)))
void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* restrict global_bornForce,
void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* restrict global_bornForce,
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii,
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii,
__local AtomData2* restrict localData,
__local float4* restrict tempBuffer,
__local AtomData2* restrict localData,
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
#else
#else
...
@@ -223,6 +228,7 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
...
@@ -223,6 +228,7 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
#endif
#endif
float energy = 0.0f;
float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF;
unsigned int lasty = 0xFFFFFFFF;
__local float4 tempBuffer[FORCE_WORK_GROUP_SIZE/2];
while (pos < end) {
while (pos < end) {
// Extract the coordinates of this tile
// Extract the coordinates of this tile
...
@@ -295,7 +301,7 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
...
@@ -295,7 +301,7 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
//
Sum
the
forces
and
write
results.
//
Sum
the
forces
and
write
results.
if
(
get_local_id
(
0
)
>=
TILE_SIZE
)
if
(
get_local_id
(
0
)
>=
TILE_SIZE
)
tempBuffer[
get_local_id
(
0
)
]
=
force
;
tempBuffer[
tgx
]
=
force
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
...
@@ -303,8 +309,13 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
...
@@ -303,8 +309,13 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
#
else
#
else
unsigned
int
offset
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
#
endif
#
endif
forceBuffers[offset].xyz
=
forceBuffers[offset].xyz+force.xyz+tempBuffer[get_local_id
(
0
)
+TILE_SIZE].xyz
;
//
Cheaper
to
load/store
float4
than
float3.
Do
all
loads
before
all
stores
to
minimize
store-load
waits.
global_bornForce[offset]
+=
force.w+tempBuffer[get_local_id
(
0
)
+TILE_SIZE].w
;
float4
sum
=
forceBuffers[offset]
;
float
global_sum
=
global_bornForce[offset]
;
sum.xyz
+=
force.xyz
+
tempBuffer[tgx].xyz
;
global_sum
+=
force.w
+
tempBuffer[tgx].w
;
forceBuffers[offset]
=
sum
;
global_bornForce[offset]
=
global_sum
;
}
}
}
}
else
{
else
{
...
@@ -370,7 +381,7 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
...
@@ -370,7 +381,7 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
//
Sum
the
forces
and
write
results.
//
Sum
the
forces
and
write
results.
if
(
get_local_id
(
0
)
>=
TILE_SIZE
)
if
(
get_local_id
(
0
)
>=
TILE_SIZE
)
tempBuffer[
get_local_id
(
0
)
]
=
force
;
tempBuffer[
tgx
]
=
force
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
...
@@ -380,14 +391,21 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
...
@@ -380,14 +391,21 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
unsigned
int
offset1
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset1
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
#
endif
#
endif
forceBuffers[offset1].xyz
=
forceBuffers[offset1].xyz+force.xyz+tempBuffer[get_local_id
(
0
)
+TILE_SIZE].xyz
;
//
Cheaper
to
load/store
float4
than
float3.
Do
all
loads
before
all
stores
to
minimize
store-load
waits.
float4
sum
=
(
float4
)
(
localData[get_local_id
(
0
)
].fx+localData[get_local_id
(
0
)
+TILE_SIZE].fx,
float4
sum1
=
forceBuffers[offset1]
;
localData[get_local_id
(
0
)
].fy+localData[get_local_id
(
0
)
+TILE_SIZE].fy,
float4
sum2
=
forceBuffers[offset2]
;
localData[get_local_id
(
0
)
].fz+localData[get_local_id
(
0
)
+TILE_SIZE].fz,
float
global_sum1
=
global_bornForce[offset2]
;
localData[get_local_id
(
0
)
].fw+localData[get_local_id
(
0
)
+TILE_SIZE].fw
)
;
float
global_sum2
=
global_bornForce[offset2]
;
forceBuffers[offset2].xyz
=
forceBuffers[offset2].xyz+sum.xyz
;
sum1.xyz
+=
force.xyz
+
tempBuffer[tgx].xyz
;
global_bornForce[offset1]
+=
force.w+tempBuffer[get_local_id
(
0
)
+TILE_SIZE].w
;
global_sum1
+=
force.w
+
tempBuffer[tgx].w
;
global_bornForce[offset2]
+=
sum.w
;
sum2.x
+=
localData[get_local_id
(
0
)
].fx
+
localData[get_local_id
(
0
)
+TILE_SIZE].fx
;
sum2.y
+=
localData[get_local_id
(
0
)
].fy
+
localData[get_local_id
(
0
)
+TILE_SIZE].fy
;
sum2.z
+=
localData[get_local_id
(
0
)
].fz
+
localData[get_local_id
(
0
)
+TILE_SIZE].fz
;
global_sum2
+=
localData[get_local_id
(
0
)
].fw
+
localData[get_local_id
(
0
)
+TILE_SIZE].fw
;
forceBuffers[offset1]
=
sum1
;
forceBuffers[offset2]
=
sum2
;
global_bornForce[offset1]
=
global_sum1
;
global_bornForce[offset2]
=
global_sum2
;
}
}
}
}
lasty
=
y
;
lasty
=
y
;
...
...
platforms/opencl/src/kernels/gbsaObc_nvidia.cl
View file @
ce74c6ae
...
@@ -3,6 +3,7 @@
...
@@ -3,6 +3,7 @@
#
pragma
OPENCL
EXTENSION
cl_khr_int64_base_atomics
:
enable
#
pragma
OPENCL
EXTENSION
cl_khr_int64_base_atomics
:
enable
#
endif
#
endif
#
define
TILE_SIZE
32
#
define
TILE_SIZE
32
#
define
WARPS_PER_GROUP
(
FORCE_WORK_GROUP_SIZE/TILE_SIZE
)
typedef
struct
{
typedef
struct
{
float
x,
y,
z
;
float
x,
y,
z
;
...
@@ -21,7 +22,7 @@ __kernel void computeBornSum(
...
@@ -21,7 +22,7 @@ __kernel void computeBornSum(
__global
float*
restrict
global_bornSum,
__global
float*
restrict
global_bornSum,
#
endif
#
endif
__global
const
float4*
restrict
posq,
__global
const
float2*
restrict
global_params,
__global
const
float4*
restrict
posq,
__global
const
float2*
restrict
global_params,
__local
AtomData1*
restrict
localData,
__local
float*
restrict
tempBuffer,
__local
AtomData1*
restrict
localData,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags,
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags,
#
else
#
else
...
@@ -39,6 +40,7 @@ __kernel void computeBornSum(
...
@@ -39,6 +40,7 @@ __kernel void computeBornSum(
unsigned
int
end
=
(
warp+1
)
*numTiles/totalWarps
;
unsigned
int
end
=
(
warp+1
)
*numTiles/totalWarps
;
#
endif
#
endif
unsigned
int
lasty
=
0xFFFFFFFF
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__local
float
tempBuffer[FORCE_WORK_GROUP_SIZE]
;
__local
int2
reservedBlocks[WARPS_PER_GROUP]
;
__local
int2
reservedBlocks[WARPS_PER_GROUP]
;
__local
unsigned
int*
exclusionRange
=
(
__local
unsigned
int*
)
reservedBlocks
;
__local
unsigned
int*
exclusionRange
=
(
__local
unsigned
int*
)
reservedBlocks
;
__local
int
exclusionIndex[WARPS_PER_GROUP]
;
__local
int
exclusionIndex[WARPS_PER_GROUP]
;
...
@@ -342,7 +344,7 @@ __kernel void computeGBSAForce1(
...
@@ -342,7 +344,7 @@ __kernel void computeGBSAForce1(
__global float4* restrict forceBuffers, __global float* restrict global_bornForce,
__global float4* restrict forceBuffers, __global float* restrict global_bornForce,
#endif
#endif
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii,
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii,
__local AtomData2* restrict localData,
__local float4* restrict tempBuffer,
__local AtomData2* restrict localData,
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags,
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags,
#else
#else
...
@@ -361,6 +363,7 @@ __kernel void computeGBSAForce1(
...
@@ -361,6 +363,7 @@ __kernel void computeGBSAForce1(
#endif
#endif
float energy = 0.0f;
float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF;
unsigned int lasty = 0xFFFFFFFF;
__local float4 tempBuffer[FORCE_WORK_GROUP_SIZE];
__local int2 reservedBlocks[WARPS_PER_GROUP];
__local int2 reservedBlocks[WARPS_PER_GROUP];
__local unsigned int* exclusionRange = (__local unsigned int*) reservedBlocks;
__local unsigned int* exclusionRange = (__local unsigned int*) reservedBlocks;
__local int exclusionIndex[WARPS_PER_GROUP];
__local int exclusionIndex[WARPS_PER_GROUP];
...
...
platforms/opencl/src/kernels/nonbonded_default.cl
View file @
ce74c6ae
...
@@ -11,7 +11,7 @@ typedef struct {
...
@@ -11,7 +11,7 @@ typedef struct {
*
Compute
nonbonded
interactions.
*
Compute
nonbonded
interactions.
*/
*/
__kernel
__attribute__
((
reqd_work_group_size
(
NONBONDED
_WORK_GROUP_SIZE,
1
,
1
)))
__kernel
__attribute__
((
reqd_work_group_size
(
FORCE
_WORK_GROUP_SIZE,
1
,
1
)))
void
computeNonbonded
(
__global
float4*
restrict
forceBuffers,
__global
float*
restrict
energyBuffer,
__global
const
float4*
restrict
posq,
__global
const
unsigned
int*
restrict
exclusions,
void
computeNonbonded
(
__global
float4*
restrict
forceBuffers,
__global
float*
restrict
energyBuffer,
__global
const
float4*
restrict
posq,
__global
const
unsigned
int*
restrict
exclusions,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
__local
AtomData*
restrict
localData,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
__local
AtomData*
restrict
localData,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
...
@@ -31,7 +31,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
...
@@ -31,7 +31,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
#
endif
#
endif
float
energy
=
0.0f
;
float
energy
=
0.0f
;
unsigned
int
lasty
=
0xFFFFFFFF
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__local
float
tempBuffer[3*
(
NONBONDED
_WORK_GROUP_SIZE/2
)
]
;
__local
float
tempBuffer[3*
(
FORCE
_WORK_GROUP_SIZE/2
)
]
;
__local
unsigned
int
exclusionRange[2]
;
__local
unsigned
int
exclusionRange[2]
;
__local
int
exclusionIndex[1]
;
__local
int
exclusionIndex[1]
;
...
@@ -138,6 +138,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
...
@@ -138,6 +138,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
#else
#else
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
#endif
// Cheaper to load/store float4 than float3.
float4 sum = forceBuffers[offset];
float4 sum = forceBuffers[offset];
sum += force + (float4) (tempBuffer[bufferIndex], tempBuffer[bufferIndex+1], tempBuffer[bufferIndex+2], 0.0f);
sum += force + (float4) (tempBuffer[bufferIndex], tempBuffer[bufferIndex+1], tempBuffer[bufferIndex+2], 0.0f);
forceBuffers[offset] = sum;
forceBuffers[offset] = sum;
...
@@ -232,7 +233,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
...
@@ -232,7 +233,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
unsigned
int
offset1
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset1
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
#
endif
#
endif
//
Cheaper
to
load/store
float4
than
float3.
Do
both
loads
before
both
stores
to
minimize
store-load
waits.
//
Cheaper
to
load/store
float4
than
float3.
Do
all
loads
before
all
stores
to
minimize
store-load
waits.
float4
sum1
=
forceBuffers[offset1]
;
float4
sum1
=
forceBuffers[offset1]
;
float4
sum2
=
forceBuffers[offset2]
;
float4
sum2
=
forceBuffers[offset2]
;
sum1
+=
force
+
(
float4
)
(
tempBuffer[bufferIndex],
tempBuffer[bufferIndex+1],
tempBuffer[bufferIndex+2],
0.0f
)
;
sum1
+=
force
+
(
float4
)
(
tempBuffer[bufferIndex],
tempBuffer[bufferIndex+1],
tempBuffer[bufferIndex+2],
0.0f
)
;
...
...
platforms/opencl/src/kernels/nonbonded_nvidia.cl
View file @
ce74c6ae
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
#
pragma
OPENCL
EXTENSION
cl_khr_int64_base_atomics
:
enable
#
pragma
OPENCL
EXTENSION
cl_khr_int64_base_atomics
:
enable
#
endif
#
endif
#
define
TILE_SIZE
32
#
define
TILE_SIZE
32
#
define
WARPS_PER_GROUP
(
NONBONDED
_WORK_GROUP_SIZE/TILE_SIZE
)
#
define
WARPS_PER_GROUP
(
FORCE
_WORK_GROUP_SIZE/TILE_SIZE
)
typedef
struct
{
typedef
struct
{
float
x,
y,
z
;
float
x,
y,
z
;
...
@@ -41,7 +41,7 @@ __kernel void computeNonbonded(
...
@@ -41,7 +41,7 @@ __kernel void computeNonbonded(
unsigned
int
end
=
startTileIndex+
(
warp+1
)
*numTiles/totalWarps
;
unsigned
int
end
=
startTileIndex+
(
warp+1
)
*numTiles/totalWarps
;
#
endif
#
endif
float
energy
=
0.0f
;
float
energy
=
0.0f
;
__local
float
tempBuffer[3*
NONBONDED
_WORK_GROUP_SIZE]
;
__local
float
tempBuffer[3*
FORCE
_WORK_GROUP_SIZE]
;
__local
unsigned
int
exclusionRange[2*WARPS_PER_GROUP]
;
__local
unsigned
int
exclusionRange[2*WARPS_PER_GROUP]
;
__local
int
exclusionIndex[WARPS_PER_GROUP]
;
__local
int
exclusionIndex[WARPS_PER_GROUP]
;
__local
int2*
reservedBlocks
=
(
__local
int2*
)
exclusionRange
;
__local
int2*
reservedBlocks
=
(
__local
int2*
)
exclusionRange
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment