Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
ce74c6ae
"platforms/cuda/tests/TestCudaPythonForce.cpp" did not exist on "83d57922865f4fc7714611e172970377a65a7c00"
Commit
ce74c6ae
authored
Oct 25, 2011
by
Peter Eastman
Browse files
More changes from Tony to reduce local memory use
parent
3f64d970
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
64 additions
and
45 deletions
+64
-45
platforms/opencl/src/OpenCLKernels.cpp
platforms/opencl/src/OpenCLKernels.cpp
+10
-13
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+1
-1
platforms/opencl/src/kernels/gbsaObc_cpu.cl
platforms/opencl/src/kernels/gbsaObc_cpu.cl
+2
-2
platforms/opencl/src/kernels/gbsaObc_default.cl
platforms/opencl/src/kernels/gbsaObc_default.cl
+40
-22
platforms/opencl/src/kernels/gbsaObc_nvidia.cl
platforms/opencl/src/kernels/gbsaObc_nvidia.cl
+5
-2
platforms/opencl/src/kernels/nonbonded_default.cl
platforms/opencl/src/kernels/nonbonded_default.cl
+4
-3
platforms/opencl/src/kernels/nonbonded_nvidia.cl
platforms/opencl/src/kernels/nonbonded_nvidia.cl
+2
-2
No files found.
platforms/opencl/src/OpenCLKernels.cpp
View file @
ce74c6ae
...
...
@@ -1517,8 +1517,7 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
defines
[
"NUM_ATOMS"
]
=
intToString
(
cl
.
getNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
intToString
(
cl
.
getPaddedNumAtoms
());
defines
[
"NUM_BLOCKS"
]
=
OpenCLExpressionUtilities
::
intToString
(
cl
.
getNumAtomBlocks
());
if
(
cl
.
getSIMDWidth
()
==
32
)
defines
[
"WARPS_PER_GROUP"
]
=
OpenCLExpressionUtilities
::
intToString
(
cl
.
getNonbondedUtilities
().
getForceThreadBlockSize
()
/
OpenCLContext
::
TileSize
);
defines
[
"FORCE_WORK_GROUP_SIZE"
]
=
OpenCLExpressionUtilities
::
intToString
(
nb
.
getForceThreadBlockSize
());
string
file
;
if
(
deviceIsCpu
)
file
=
OpenCLKernelSources
::
gbsaObc_cpu
;
...
...
@@ -1534,7 +1533,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getPosq
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
params
->
getDeviceBuffer
());
computeBornSumKernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
:
nb
.
getForceThreadBlockSize
())
*
7
*
sizeof
(
cl_float
),
NULL
);
computeBornSumKernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
1
:
nb
.
getForceThreadBlockSize
())
*
sizeof
(
cl_float
),
NULL
);
if
(
nb
.
getUseCutoff
())
{
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
...
...
@@ -1557,7 +1555,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getPosq
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
bornRadii
->
getDeviceBuffer
());
force1Kernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
:
nb
.
getForceThreadBlockSize
())
*
9
*
sizeof
(
cl_float
),
NULL
);
force1Kernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
1
:
nb
.
getForceThreadBlockSize
())
*
sizeof
(
mm_float4
),
NULL
);
if
(
nb
.
getUseCutoff
())
{
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
nb
.
getInteractionCount
().
getDeviceBuffer
());
...
...
@@ -1596,19 +1593,19 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
reduceBornForceKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
obcChain
->
getDeviceBuffer
());
}
if
(
nb
.
getUseCutoff
())
{
computeBornSumKernel
.
setArg
<
mm_float4
>
(
7
,
cl
.
getPeriodicBoxSize
());
computeBornSumKernel
.
setArg
<
mm_float4
>
(
8
,
cl
.
getInvPeriodicBoxSize
());
force1Kernel
.
setArg
<
mm_float4
>
(
9
,
cl
.
getPeriodicBoxSize
());
force1Kernel
.
setArg
<
mm_float4
>
(
10
,
cl
.
getInvPeriodicBoxSize
());
computeBornSumKernel
.
setArg
<
mm_float4
>
(
6
,
cl
.
getPeriodicBoxSize
());
computeBornSumKernel
.
setArg
<
mm_float4
>
(
7
,
cl
.
getInvPeriodicBoxSize
());
force1Kernel
.
setArg
<
mm_float4
>
(
8
,
cl
.
getPeriodicBoxSize
());
force1Kernel
.
setArg
<
mm_float4
>
(
9
,
cl
.
getInvPeriodicBoxSize
());
if
(
maxTiles
<
nb
.
getInteractingTiles
().
getSize
())
{
maxTiles
=
nb
.
getInteractingTiles
().
getSize
();
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
5
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl_uint
>
(
9
,
maxTiles
);
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
7
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl_uint
>
(
1
1
,
maxTiles
);
computeBornSumKernel
.
setArg
<
cl_uint
>
(
8
,
maxTiles
);
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
6
,
nb
.
getInteractingTiles
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl_uint
>
(
1
0
,
maxTiles
);
if
(
cl
.
getSIMDWidth
()
==
32
||
deviceIsCpu
)
{
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
10
,
nb
.
getInteractionFlags
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
1
2
,
nb
.
getInteractionFlags
().
getDeviceBuffer
());
computeBornSumKernel
.
setArg
<
cl
::
Buffer
>
(
9
,
nb
.
getInteractionFlags
().
getDeviceBuffer
());
force1Kernel
.
setArg
<
cl
::
Buffer
>
(
1
1
,
nb
.
getInteractionFlags
().
getDeviceBuffer
());
}
}
}
...
...
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
View file @
ce74c6ae
...
...
@@ -480,7 +480,7 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
defines
[
"USE_EXCLUSIONS"
]
=
"1"
;
if
(
isSymmetric
)
defines
[
"USE_SYMMETRIC"
]
=
"1"
;
defines
[
"
NONBONDED
_WORK_GROUP_SIZE"
]
=
OpenCLExpressionUtilities
::
intToString
(
forceThreadBlockSize
);
defines
[
"
FORCE
_WORK_GROUP_SIZE"
]
=
OpenCLExpressionUtilities
::
intToString
(
forceThreadBlockSize
);
defines
[
"CUTOFF_SQUARED"
]
=
OpenCLExpressionUtilities
::
doubleToString
(
cutoff
*
cutoff
);
defines
[
"NUM_ATOMS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getPaddedNumAtoms
());
...
...
platforms/opencl/src/kernels/gbsaObc_cpu.cl
View file @
ce74c6ae
...
...
@@ -15,7 +15,7 @@ typedef struct {
*/
__kernel
void
computeBornSum
(
__global
float*
restrict
global_bornSum,
__global
const
float4*
restrict
posq,
__global
const
float2*
restrict
global_params,
__local
AtomData*
restrict
localData,
__local
float*
restrict
tempBuffer,
__local
AtomData*
restrict
localData,
#
ifdef
USE_CUTOFF
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags
)
{
#
else
...
...
@@ -192,7 +192,7 @@ __kernel void computeBornSum(__global float* restrict global_bornSum, __global c
__kernel void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* restrict energyBuffer,
__global const float4* restrict posq, __global const float* restrict global_bornRadii, __global float* restrict global_bornForce,
__local AtomData* restrict localData,
__local float4* restrict tempBuffer,
__local AtomData* restrict localData,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags) {
#else
...
...
platforms/opencl/src/kernels/gbsaObc_default.cl
View file @
ce74c6ae
...
...
@@ -11,9 +11,9 @@ typedef struct {
*
Compute
the
Born
sum.
*/
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
__kernel
__attribute__
((
reqd_work_group_size
(
FORCE_
WORK_GROUP_SIZE,
1
,
1
)))
void
computeBornSum
(
__global
float*
restrict
global_bornSum,
__global
const
float4*
restrict
posq,
__global
const
float2*
restrict
global_params,
__local
AtomData1*
restrict
localData,
__local
float*
restrict
tempBuffer,
__local
AtomData1*
restrict
localData,
#
ifdef
USE_CUTOFF
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles
)
{
#
else
...
...
@@ -28,6 +28,7 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
unsigned
int
end
=
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
#
endif
unsigned
int
lasty
=
0xFFFFFFFF
;
__local
float
tempBuffer[FORCE_WORK_GROUP_SIZE/2]
;
while
(
pos
<
end
)
{
//
Extract
the
coordinates
of
this
tile
...
...
@@ -94,9 +95,8 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
// Sum the forces and write results.
if (get_local_id(0) >= TILE_SIZE)
tempBuffer[
get_local_id(0)
] = bornSum;
tempBuffer[
tgx
] = bornSum;
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
...
...
@@ -104,7 +104,7 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
#else
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
global_bornSum[offset] += bornSum+tempBuffer[
get_local_id(0)+TILE_SIZE
];
global_bornSum[offset] += bornSum+tempBuffer[
tgx
];
}
}
else {
...
...
@@ -174,7 +174,7 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
// Sum the forces and write results.
if (get_local_id(0) >= TILE_SIZE)
tempBuffer[
get_local_id(0)
] = bornSum;
tempBuffer[
tgx
] = bornSum;
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
...
...
@@ -184,8 +184,13 @@ void computeBornSum(__global float* restrict global_bornSum, __global const floa
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
global_bornSum[offset1] += bornSum+tempBuffer[get_local_id(0)+TILE_SIZE];
global_bornSum[offset2] += localData[get_local_id(0)].bornSum+localData[get_local_id(0)+TILE_SIZE].bornSum;
// Do both loads before both stores to minimize store-load waits.
float sum1 = global_bornSum[offset1];
float sum2 = global_bornSum[offset2];
sum1 += bornSum + tempBuffer[tgx];
sum2 += localData[get_local_id(0)].bornSum + localData[get_local_id(0)+TILE_SIZE].bornSum;
global_bornSum[offset1] = sum1;
global_bornSum[offset2] = sum2;
}
}
lasty = y;
...
...
@@ -204,10 +209,10 @@ typedef struct {
* First part of computing the GBSA interaction.
*/
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
__kernel __attribute__((reqd_work_group_size(
FORCE_
WORK_GROUP_SIZE, 1, 1)))
void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* restrict global_bornForce,
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii,
__local AtomData2* restrict localData,
__local float4* restrict tempBuffer,
__local AtomData2* restrict localData,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles) {
#else
...
...
@@ -223,6 +228,7 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
#endif
float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF;
__local float4 tempBuffer[FORCE_WORK_GROUP_SIZE/2];
while (pos < end) {
// Extract the coordinates of this tile
...
...
@@ -295,7 +301,7 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
//
Sum
the
forces
and
write
results.
if
(
get_local_id
(
0
)
>=
TILE_SIZE
)
tempBuffer[
get_local_id
(
0
)
]
=
force
;
tempBuffer[
tgx
]
=
force
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
...
...
@@ -303,8 +309,13 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
#
else
unsigned
int
offset
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
#
endif
forceBuffers[offset].xyz
=
forceBuffers[offset].xyz+force.xyz+tempBuffer[get_local_id
(
0
)
+TILE_SIZE].xyz
;
global_bornForce[offset]
+=
force.w+tempBuffer[get_local_id
(
0
)
+TILE_SIZE].w
;
//
Cheaper
to
load/store
float4
than
float3.
Do
all
loads
before
all
stores
to
minimize
store-load
waits.
float4
sum
=
forceBuffers[offset]
;
float
global_sum
=
global_bornForce[offset]
;
sum.xyz
+=
force.xyz
+
tempBuffer[tgx].xyz
;
global_sum
+=
force.w
+
tempBuffer[tgx].w
;
forceBuffers[offset]
=
sum
;
global_bornForce[offset]
=
global_sum
;
}
}
else
{
...
...
@@ -370,7 +381,7 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
//
Sum
the
forces
and
write
results.
if
(
get_local_id
(
0
)
>=
TILE_SIZE
)
tempBuffer[
get_local_id
(
0
)
]
=
force
;
tempBuffer[
tgx
]
=
force
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
...
...
@@ -380,14 +391,21 @@ void computeGBSAForce1(__global float4* restrict forceBuffers, __global float* r
unsigned
int
offset1
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
#
endif
forceBuffers[offset1].xyz
=
forceBuffers[offset1].xyz+force.xyz+tempBuffer[get_local_id
(
0
)
+TILE_SIZE].xyz
;
float4
sum
=
(
float4
)
(
localData[get_local_id
(
0
)
].fx+localData[get_local_id
(
0
)
+TILE_SIZE].fx,
localData[get_local_id
(
0
)
].fy+localData[get_local_id
(
0
)
+TILE_SIZE].fy,
localData[get_local_id
(
0
)
].fz+localData[get_local_id
(
0
)
+TILE_SIZE].fz,
localData[get_local_id
(
0
)
].fw+localData[get_local_id
(
0
)
+TILE_SIZE].fw
)
;
forceBuffers[offset2].xyz
=
forceBuffers[offset2].xyz+sum.xyz
;
global_bornForce[offset1]
+=
force.w+tempBuffer[get_local_id
(
0
)
+TILE_SIZE].w
;
global_bornForce[offset2]
+=
sum.w
;
//
Cheaper
to
load/store
float4
than
float3.
Do
all
loads
before
all
stores
to
minimize
store-load
waits.
float4
sum1
=
forceBuffers[offset1]
;
float4
sum2
=
forceBuffers[offset2]
;
float
global_sum1
=
global_bornForce[offset2]
;
float
global_sum2
=
global_bornForce[offset2]
;
sum1.xyz
+=
force.xyz
+
tempBuffer[tgx].xyz
;
global_sum1
+=
force.w
+
tempBuffer[tgx].w
;
sum2.x
+=
localData[get_local_id
(
0
)
].fx
+
localData[get_local_id
(
0
)
+TILE_SIZE].fx
;
sum2.y
+=
localData[get_local_id
(
0
)
].fy
+
localData[get_local_id
(
0
)
+TILE_SIZE].fy
;
sum2.z
+=
localData[get_local_id
(
0
)
].fz
+
localData[get_local_id
(
0
)
+TILE_SIZE].fz
;
global_sum2
+=
localData[get_local_id
(
0
)
].fw
+
localData[get_local_id
(
0
)
+TILE_SIZE].fw
;
forceBuffers[offset1]
=
sum1
;
forceBuffers[offset2]
=
sum2
;
global_bornForce[offset1]
=
global_sum1
;
global_bornForce[offset2]
=
global_sum2
;
}
}
lasty
=
y
;
...
...
platforms/opencl/src/kernels/gbsaObc_nvidia.cl
View file @
ce74c6ae
...
...
@@ -3,6 +3,7 @@
#
pragma
OPENCL
EXTENSION
cl_khr_int64_base_atomics
:
enable
#
endif
#
define
TILE_SIZE
32
#
define
WARPS_PER_GROUP
(
FORCE_WORK_GROUP_SIZE/TILE_SIZE
)
typedef
struct
{
float
x,
y,
z
;
...
...
@@ -21,7 +22,7 @@ __kernel void computeBornSum(
__global
float*
restrict
global_bornSum,
#
endif
__global
const
float4*
restrict
posq,
__global
const
float2*
restrict
global_params,
__local
AtomData1*
restrict
localData,
__local
float*
restrict
tempBuffer,
__local
AtomData1*
restrict
localData,
#
ifdef
USE_CUTOFF
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags,
#
else
...
...
@@ -39,6 +40,7 @@ __kernel void computeBornSum(
unsigned
int
end
=
(
warp+1
)
*numTiles/totalWarps
;
#
endif
unsigned
int
lasty
=
0xFFFFFFFF
;
__local
float
tempBuffer[FORCE_WORK_GROUP_SIZE]
;
__local
int2
reservedBlocks[WARPS_PER_GROUP]
;
__local
unsigned
int*
exclusionRange
=
(
__local
unsigned
int*
)
reservedBlocks
;
__local
int
exclusionIndex[WARPS_PER_GROUP]
;
...
...
@@ -342,7 +344,7 @@ __kernel void computeGBSAForce1(
__global float4* restrict forceBuffers, __global float* restrict global_bornForce,
#endif
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const float* restrict global_bornRadii,
__local AtomData2* restrict localData,
__local float4* restrict tempBuffer,
__local AtomData2* restrict localData,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags,
#else
...
...
@@ -361,6 +363,7 @@ __kernel void computeGBSAForce1(
#endif
float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF;
__local float4 tempBuffer[FORCE_WORK_GROUP_SIZE];
__local int2 reservedBlocks[WARPS_PER_GROUP];
__local unsigned int* exclusionRange = (__local unsigned int*) reservedBlocks;
__local int exclusionIndex[WARPS_PER_GROUP];
...
...
platforms/opencl/src/kernels/nonbonded_default.cl
View file @
ce74c6ae
...
...
@@ -11,7 +11,7 @@ typedef struct {
*
Compute
nonbonded
interactions.
*/
__kernel
__attribute__
((
reqd_work_group_size
(
NONBONDED
_WORK_GROUP_SIZE,
1
,
1
)))
__kernel
__attribute__
((
reqd_work_group_size
(
FORCE
_WORK_GROUP_SIZE,
1
,
1
)))
void
computeNonbonded
(
__global
float4*
restrict
forceBuffers,
__global
float*
restrict
energyBuffer,
__global
const
float4*
restrict
posq,
__global
const
unsigned
int*
restrict
exclusions,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
__local
AtomData*
restrict
localData,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
...
...
@@ -31,7 +31,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
#
endif
float
energy
=
0.0f
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__local
float
tempBuffer[3*
(
NONBONDED
_WORK_GROUP_SIZE/2
)
]
;
__local
float
tempBuffer[3*
(
FORCE
_WORK_GROUP_SIZE/2
)
]
;
__local
unsigned
int
exclusionRange[2]
;
__local
int
exclusionIndex[1]
;
...
...
@@ -138,6 +138,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
#else
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
// Cheaper to load/store float4 than float3.
float4 sum = forceBuffers[offset];
sum += force + (float4) (tempBuffer[bufferIndex], tempBuffer[bufferIndex+1], tempBuffer[bufferIndex+2], 0.0f);
forceBuffers[offset] = sum;
...
...
@@ -232,7 +233,7 @@ void computeNonbonded(__global float4* restrict forceBuffers, __global float* re
unsigned
int
offset1
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
y*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
#
endif
//
Cheaper
to
load/store
float4
than
float3.
Do
both
loads
before
both
stores
to
minimize
store-load
waits.
//
Cheaper
to
load/store
float4
than
float3.
Do
all
loads
before
all
stores
to
minimize
store-load
waits.
float4
sum1
=
forceBuffers[offset1]
;
float4
sum2
=
forceBuffers[offset2]
;
sum1
+=
force
+
(
float4
)
(
tempBuffer[bufferIndex],
tempBuffer[bufferIndex+1],
tempBuffer[bufferIndex+2],
0.0f
)
;
...
...
platforms/opencl/src/kernels/nonbonded_nvidia.cl
View file @
ce74c6ae
...
...
@@ -3,7 +3,7 @@
#
pragma
OPENCL
EXTENSION
cl_khr_int64_base_atomics
:
enable
#
endif
#
define
TILE_SIZE
32
#
define
WARPS_PER_GROUP
(
NONBONDED
_WORK_GROUP_SIZE/TILE_SIZE
)
#
define
WARPS_PER_GROUP
(
FORCE
_WORK_GROUP_SIZE/TILE_SIZE
)
typedef
struct
{
float
x,
y,
z
;
...
...
@@ -41,7 +41,7 @@ __kernel void computeNonbonded(
unsigned
int
end
=
startTileIndex+
(
warp+1
)
*numTiles/totalWarps
;
#
endif
float
energy
=
0.0f
;
__local
float
tempBuffer[3*
NONBONDED
_WORK_GROUP_SIZE]
;
__local
float
tempBuffer[3*
FORCE
_WORK_GROUP_SIZE]
;
__local
unsigned
int
exclusionRange[2*WARPS_PER_GROUP]
;
__local
int
exclusionIndex[WARPS_PER_GROUP]
;
__local
int2*
reservedBlocks
=
(
__local
int2*
)
exclusionRange
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment