Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
dd352ee5
Commit
dd352ee5
authored
Apr 26, 2011
by
Peter Eastman
Browse files
Added dynamic load balancing between GPUs
parent
6e3526b4
Changes
9
Show whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
106 additions
and
35 deletions
+106
-35
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+22
-9
platforms/opencl/src/OpenCLNonbondedUtilities.h
platforms/opencl/src/OpenCLNonbondedUtilities.h
+4
-0
platforms/opencl/src/OpenCLParallelKernels.cpp
platforms/opencl/src/OpenCLParallelKernels.cpp
+54
-7
platforms/opencl/src/OpenCLParallelKernels.h
platforms/opencl/src/OpenCLParallelKernels.h
+2
-0
platforms/opencl/src/kernels/findInteractingBlocks.cl
platforms/opencl/src/kernels/findInteractingBlocks.cl
+4
-3
platforms/opencl/src/kernels/findInteractingBlocks_cpu.cl
platforms/opencl/src/kernels/findInteractingBlocks_cpu.cl
+5
-4
platforms/opencl/src/kernels/nonbonded_cpu.cl
platforms/opencl/src/kernels/nonbonded_cpu.cl
+5
-4
platforms/opencl/src/kernels/nonbonded_default.cl
platforms/opencl/src/kernels/nonbonded_default.cl
+5
-4
platforms/opencl/src/kernels/nonbonded_nvidia.cl
platforms/opencl/src/kernels/nonbonded_nvidia.cl
+5
-4
No files found.
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
View file @
dd352ee5
...
...
@@ -233,8 +233,6 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
if
(
useCutoff
)
{
map
<
string
,
string
>
defines
;
defines
[
"NUM_BLOCKS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getNumAtomBlocks
());
defines
[
"START_TILE_INDEX"
]
=
OpenCLExpressionUtilities
::
intToString
(
startTileIndex
);
defines
[
"END_TILE_INDEX"
]
=
OpenCLExpressionUtilities
::
intToString
(
startTileIndex
+
numTiles
);
if
(
forceBufferPerAtomBlock
)
defines
[
"USE_OUTPUT_BUFFER_PER_BLOCK"
]
=
"1"
;
if
(
usePeriodic
)
...
...
@@ -256,6 +254,8 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
interactionFlags
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
8
,
context
.
getPosq
().
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl_uint
>
(
9
,
interactingTiles
->
getSize
());
findInteractingBlocksKernel
.
setArg
<
cl_uint
>
(
10
,
startTileIndex
);
findInteractingBlocksKernel
.
setArg
<
cl_uint
>
(
11
,
startTileIndex
+
numTiles
);
if
(
context
.
getSIMDWidth
()
==
32
&&
!
deviceIsCpu
)
{
findInteractionsWithinBlocksKernel
=
cl
::
Kernel
(
interactingBlocksProgram
,
"findInteractionsWithinBlocks"
);
findInteractionsWithinBlocksKernel
.
setArg
<
cl_float
>
(
0
,
(
cl_float
)
(
cutoff
*
cutoff
));
...
...
@@ -302,8 +302,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
void
OpenCLNonbondedUtilities
::
computeInteractions
()
{
if
(
cutoff
!=
-
1.0
)
{
if
(
useCutoff
)
{
forceKernel
.
setArg
<
mm_float4
>
(
1
0
,
context
.
getPeriodicBoxSize
());
forceKernel
.
setArg
<
mm_float4
>
(
1
1
,
context
.
getInvPeriodicBoxSize
());
forceKernel
.
setArg
<
mm_float4
>
(
1
2
,
context
.
getPeriodicBoxSize
());
forceKernel
.
setArg
<
mm_float4
>
(
1
3
,
context
.
getInvPeriodicBoxSize
());
}
context
.
executeKernel
(
forceKernel
,
(
context
.
getNumAtomBlocks
()
*
(
context
.
getNumAtomBlocks
()
+
1
)
/
2
)
*
OpenCLContext
::
TileSize
,
deviceIsCpu
?
1
:
-
1
);
}
...
...
@@ -325,14 +325,14 @@ void OpenCLNonbondedUtilities::updateNeighborListSize() {
newSize
=
numTiles
;
delete
interactingTiles
;
interactingTiles
=
new
OpenCLArray
<
mm_ushort2
>
(
context
,
newSize
,
"interactingTiles"
);
forceKernel
.
setArg
<
cl
::
Buffer
>
(
8
,
interactingTiles
->
getDeviceBuffer
());
forceKernel
.
setArg
<
cl_uint
>
(
1
2
,
newSize
);
forceKernel
.
setArg
<
cl
::
Buffer
>
(
10
,
interactingTiles
->
getDeviceBuffer
());
forceKernel
.
setArg
<
cl_uint
>
(
1
4
,
newSize
);
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
interactingTiles
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl_uint
>
(
9
,
newSize
);
if
(
context
.
getSIMDWidth
()
==
32
||
deviceIsCpu
)
{
delete
interactionFlags
;
interactionFlags
=
new
OpenCLArray
<
cl_uint
>
(
context
,
deviceIsCpu
?
2
*
newSize
:
newSize
,
"interactionFlags"
);
forceKernel
.
setArg
<
cl
::
Buffer
>
(
1
3
,
interactionFlags
->
getDeviceBuffer
());
forceKernel
.
setArg
<
cl
::
Buffer
>
(
1
5
,
interactionFlags
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
interactionFlags
->
getDeviceBuffer
());
findInteractionsWithinBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
interactingTiles
->
getDeviceBuffer
());
findInteractionsWithinBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
interactionFlags
->
getDeviceBuffer
());
...
...
@@ -340,6 +340,19 @@ void OpenCLNonbondedUtilities::updateNeighborListSize() {
}
}
/**
 * Record the range of tiles assigned to this context and forward the new
 * bounds to the kernels that take them as arguments.
 *
 * @param startTileIndex  index of the first tile in the range
 * @param numTiles        number of tiles in the range
 */
void OpenCLNonbondedUtilities::setTileRange(int startTileIndex, int numTiles) {
    this->startTileIndex = startTileIndex;
    this->numTiles = numTiles;

    // One past the last tile of the range; passed to the kernels as the end bound.
    const int endTileIndex = startTileIndex + numTiles;

    if (cutoff != -1.0) {
        // Arguments 8 and 9 of the force kernel hold the start and end of the range.
        forceKernel.setArg<cl_uint>(8, startTileIndex);
        forceKernel.setArg<cl_uint>(9, endTileIndex);
        if (useCutoff) {
            // The neighbor list kernel takes the same bounds as arguments 10 and 11.
            findInteractingBlocksKernel.setArg<cl_uint>(10, startTileIndex);
            findInteractingBlocksKernel.setArg<cl_uint>(11, endTileIndex);
        }
    }
    // Otherwise there are no nonbonded interactions in the System, so there is no kernel to update.
}
cl
::
Kernel
OpenCLNonbondedUtilities
::
createInteractionKernel
(
const
string
&
source
,
const
vector
<
ParameterInfo
>&
params
,
const
vector
<
ParameterInfo
>&
arguments
,
bool
useExclusions
,
bool
isSymmetric
)
const
{
map
<
string
,
string
>
replacements
;
replacements
[
"COMPUTE_INTERACTION"
]
=
source
;
...
...
@@ -447,8 +460,6 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
defines
[
"NUM_ATOMS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getPaddedNumAtoms
());
defines
[
"NUM_BLOCKS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getNumAtomBlocks
());
defines
[
"START_TILE_INDEX"
]
=
OpenCLExpressionUtilities
::
intToString
(
startTileIndex
);
defines
[
"END_TILE_INDEX"
]
=
OpenCLExpressionUtilities
::
intToString
(
startTileIndex
+
numTiles
);
string
file
;
if
(
deviceIsCpu
)
file
=
OpenCLKernelSources
::
nonbonded_cpu
;
...
...
@@ -470,6 +481,8 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
exclusionRowIndices
->
getDeviceBuffer
());
kernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
*
localDataSize
:
OpenCLContext
::
ThreadBlockSize
*
localDataSize
),
NULL
);
kernel
.
setArg
(
index
++
,
4
*
OpenCLContext
::
ThreadBlockSize
*
sizeof
(
cl_float
),
NULL
);
kernel
.
setArg
<
cl_uint
>
(
index
++
,
startTileIndex
);
kernel
.
setArg
<
cl_uint
>
(
index
++
,
startTileIndex
+
numTiles
);
if
(
useCutoff
)
{
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
interactingTiles
->
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
interactionCount
->
getDeviceBuffer
());
...
...
platforms/opencl/src/OpenCLNonbondedUtilities.h
View file @
dd352ee5
...
...
@@ -190,6 +190,10 @@ public:
int
getNumTiles
()
const
{
return
numTiles
;
}
/**
* Set the range of tiles that should be processed by this context.
*
* The range is stored on this object and covers tile indices from
* startTileIndex up to, but not including, startTileIndex+numTiles.
*
* @param startTileIndex    the index of the first tile in the range
* @param numTiles          the number of tiles in the range
*/
void
setTileRange
(
int
startTileIndex
,
int
numTiles
);
/**
* Create a Kernel for evaluating a nonbonded interaction. Cutoffs and periodic boundary conditions
* are assumed to be the same as those for the default interaction Kernel, since this kernel will use
...
...
platforms/opencl/src/OpenCLParallelKernels.cpp
View file @
dd352ee5
...
...
@@ -29,6 +29,28 @@
using
namespace
OpenMM
;
using
namespace
std
;
/**
* Get the current clock time, measured in microseconds.
*/
#ifdef _MSC_VER
#include <Windows.h>
/**
 * Windows implementation: read the system time as a FILETIME and convert it
 * to microseconds.
 *
 * NOTE(review): the return type is long, which is 32 bits wide with MSVC, so
 * the microsecond count is truncated and wraps around periodically.  Consider
 * widening the return type (and the variables that store it) to a 64-bit type.
 *
 * @return the current time in microseconds, truncated to the width of long
 */
static long getTime() {
    FILETIME ft;
    GetSystemTimeAsFileTime(&ft); // 100-nanoseconds since 1-1-1601
    ULARGE_INTEGER result;
    result.LowPart = ft.dwLowDateTime;
    result.HighPart = ft.dwHighDateTime;
    // ULARGE_INTEGER is a union, so it cannot be divided directly; the combined
    // 64-bit value has to be read through its QuadPart member.
    return static_cast<long>(result.QuadPart / 10);
}
#else
#include <sys/time.h>
/**
 * POSIX implementation: read the time of day with gettimeofday() and convert
 * it to microseconds.
 *
 * NOTE(review): where long is 32 bits wide the result is truncated to fit;
 * consider widening the return type (and the variables that store it).
 *
 * @return the current time in microseconds, converted to long
 */
static long getTime() {
    struct timeval tod;
    gettimeofday(&tod, 0);
    // Do the arithmetic in 64 bits so the multiplication cannot overflow when
    // tv_sec is only 32 bits wide (signed overflow is undefined behavior).
    return static_cast<long>(1000000LL * tod.tv_sec + tod.tv_usec);
}
#endif
class
OpenCLParallelCalcForcesAndEnergyKernel
::
BeginComputationTask
:
public
OpenCLContext
::
WorkTask
{
public:
BeginComputationTask
(
ContextImpl
&
context
,
OpenCLContext
&
cl
,
OpenCLCalcForcesAndEnergyKernel
&
kernel
,
...
...
@@ -52,8 +74,8 @@ private:
class
OpenCLParallelCalcForcesAndEnergyKernel
::
FinishComputationTask
:
public
OpenCLContext
::
WorkTask
{
public:
FinishComputationTask
(
ContextImpl
&
context
,
OpenCLContext
&
cl
,
OpenCLCalcForcesAndEnergyKernel
&
kernel
,
bool
includeForce
,
bool
includeEnergy
,
double
&
energy
)
:
context
(
context
),
cl
(
cl
),
kernel
(
kernel
),
includeForce
(
includeForce
),
includeEnergy
(
includeEnergy
),
energy
(
energy
)
{
bool
includeForce
,
bool
includeEnergy
,
double
&
energy
,
long
&
completionTime
)
:
context
(
context
),
cl
(
cl
),
kernel
(
kernel
),
includeForce
(
includeForce
),
includeEnergy
(
includeEnergy
),
energy
(
energy
)
,
completionTime
(
completionTime
)
{
}
void
execute
()
{
// Execute the kernel, then download forces.
...
...
@@ -61,7 +83,7 @@ public:
energy
+=
kernel
.
finishComputation
(
context
,
includeForce
,
includeEnergy
);
if
(
includeForce
)
cl
.
getForce
().
download
();
mm_float4
f
=
cl
.
getForce
()[
0
]
;
completionTime
=
getTime
()
;
}
private:
ContextImpl
&
context
;
...
...
@@ -69,10 +91,11 @@ private:
OpenCLCalcForcesAndEnergyKernel
&
kernel
;
bool
includeForce
,
includeEnergy
;
double
&
energy
;
long
&
completionTime
;
};
OpenCLParallelCalcForcesAndEnergyKernel
::
OpenCLParallelCalcForcesAndEnergyKernel
(
string
name
,
const
Platform
&
platform
,
OpenCLPlatform
::
PlatformData
&
data
)
:
CalcForcesAndEnergyKernel
(
name
,
platform
),
data
(
data
)
{
CalcForcesAndEnergyKernel
(
name
,
platform
),
data
(
data
)
,
completionTimes
(
data
.
contexts
.
size
()),
contextTiles
(
data
.
contexts
.
size
())
{
for
(
int
i
=
0
;
i
<
(
int
)
data
.
contexts
.
size
();
i
++
)
kernels
.
push_back
(
Kernel
(
new
OpenCLCalcForcesAndEnergyKernel
(
name
,
platform
,
*
data
.
contexts
[
i
])));
}
...
...
@@ -98,7 +121,7 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c
for
(
int
i
=
0
;
i
<
(
int
)
data
.
contexts
.
size
();
i
++
)
{
OpenCLContext
&
cl
=
*
data
.
contexts
[
i
];
OpenCLContext
::
WorkThread
&
thread
=
cl
.
getWorkThread
();
thread
.
addTask
(
new
FinishComputationTask
(
context
,
cl
,
getKernel
(
i
),
includeForce
,
includeEnergy
,
data
.
contextEnergy
[
i
]));
thread
.
addTask
(
new
FinishComputationTask
(
context
,
cl
,
getKernel
(
i
),
includeForce
,
includeEnergy
,
data
.
contextEnergy
[
i
]
,
completionTimes
[
i
]
));
}
data
.
syncContexts
();
double
energy
=
0.0
;
...
...
@@ -107,8 +130,6 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c
if
(
includeForce
)
{
// Sum the forces from all devices.
for
(
int
i
=
0
;
i
<
(
int
)
data
.
contexts
.
size
();
i
++
)
data
.
contexts
[
i
]
->
getForce
().
download
();
OpenCLArray
<
mm_float4
>&
forces
=
data
.
contexts
[
0
]
->
getForce
();
for
(
int
i
=
1
;
i
<
(
int
)
data
.
contexts
.
size
();
i
++
)
{
OpenCLArray
<
mm_float4
>&
contextForces
=
data
.
contexts
[
i
]
->
getForce
();
...
...
@@ -121,6 +142,32 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c
}
}
forces
.
upload
();
// Balance work between the contexts by transferring a few nonbonded tiles from the context that
// finished last to the one that finished first.
int
firstIndex
=
0
,
lastIndex
=
0
;
int
totalTiles
=
0
;
for
(
int
i
=
0
;
i
<
(
int
)
completionTimes
.
size
();
i
++
)
{
if
(
completionTimes
[
i
]
<
completionTimes
[
firstIndex
])
firstIndex
=
i
;
if
(
completionTimes
[
i
]
>
completionTimes
[
lastIndex
])
lastIndex
=
i
;
contextTiles
[
i
]
=
data
.
contexts
[
i
]
->
getNonbondedUtilities
().
getNumTiles
();
totalTiles
+=
contextTiles
[
i
];
}
int
tilesToTransfer
=
totalTiles
/
1000
;
if
(
tilesToTransfer
<
1
)
tilesToTransfer
=
1
;
if
(
tilesToTransfer
>
contextTiles
[
lastIndex
])
tilesToTransfer
=
contextTiles
[
lastIndex
];
contextTiles
[
firstIndex
]
+=
tilesToTransfer
;
contextTiles
[
lastIndex
]
-=
tilesToTransfer
;
int
startIndex
=
0
;
for
(
int
i
=
0
;
i
<
(
int
)
contextTiles
.
size
();
i
++
)
{
data
.
contexts
[
i
]
->
getNonbondedUtilities
().
setTileRange
(
startIndex
,
contextTiles
[
i
]);
startIndex
+=
contextTiles
[
i
];
}
}
return
energy
;
}
...
...
platforms/opencl/src/OpenCLParallelKernels.h
View file @
dd352ee5
...
...
@@ -76,6 +76,8 @@ private:
class
FinishComputationTask
;
OpenCLPlatform
::
PlatformData
&
data
;
std
::
vector
<
Kernel
>
kernels
;
std
::
vector
<
long
>
completionTimes
;
std
::
vector
<
int
>
contextTiles
;
};
/**
...
...
platforms/opencl/src/kernels/findInteractingBlocks.cl
View file @
dd352ee5
...
...
@@ -159,7 +159,8 @@ void storeInteractionData(__local ushort2* buffer, __local int* valid, __local s
*/
__kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global float4* blockCenter,
__global float4* blockBoundingBox, __global unsigned int* interactionCount, __global ushort2* interactingTiles,
__global unsigned int* interactionFlags, __global float4* posq, unsigned int maxTiles) {
__global unsigned int* interactionFlags, __global float4* posq, unsigned int maxTiles, unsigned int startTileIndex,
unsigned int endTileIndex) {
__local ushort2 buffer[BUFFER_SIZE];
__local int valid[BUFFER_SIZE];
__local short sum[BUFFER_SIZE];
...
...
@@ -172,11 +173,11 @@ __kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBox
for (int i = 0; i < BUFFER_GROUPS; ++i)
valid[i*GROUP_SIZE+get_local_id(0)] = false;
barrier(CLK_LOCAL_MEM_FENCE);
for (int baseIndex =
START_TILE_INDEX
+get_group_id(0)*get_local_size(0); baseIndex <
END_TILE_INDEX
; baseIndex += get_global_size(0)) {
for (int baseIndex =
startTileIndex
+get_group_id(0)*get_local_size(0); baseIndex <
endTileIndex
; baseIndex += get_global_size(0)) {
// Identify the pair of blocks to compare.
int index = baseIndex+get_local_id(0);
if (index <
END_TILE_INDEX
) {
if (index <
endTileIndex
) {
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*index));
unsigned int x = (index-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y |
|
x
>=
NUM_BLOCKS
)
{
//
Occasionally
happens
due
to
roundoff
error.
...
...
platforms/opencl/src/kernels/findInteractingBlocks_cpu.cl
View file @
dd352ee5
...
...
@@ -123,12 +123,13 @@ void storeInteractionData(ushort2* buffer, int numValid, __global unsigned int*
*/
__kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global float4* blockCenter,
__global float4* blockBoundingBox, __global unsigned int* interactionCount, __global ushort2* interactingTiles,
__global unsigned int* interactionFlags, __global float4* posq, unsigned int maxTiles) {
__global unsigned int* interactionFlags, __global float4* posq, unsigned int maxTiles, unsigned int startTileIndex,
unsigned int endTileIndex) {
ushort2 buffer[BUFFER_SIZE];
int valuesInBuffer = 0;
const int numTiles =
END_TILE_INDEX-START_TILE_INDEX
;
unsigned int start =
START_TILE_INDEX
+get_group_id(0)*numTiles/get_num_groups(0);
unsigned int end =
START_TILE_INDEX
+(get_group_id(0)+1)*numTiles/get_num_groups(0);
const int numTiles =
endTileIndex-startTileIndex
;
unsigned int start =
startTileIndex
+get_group_id(0)*numTiles/get_num_groups(0);
unsigned int end =
startTileIndex
+(get_group_id(0)+1)*numTiles/get_num_groups(0);
for (int index = start; index < end; index++) {
// Identify the pair of blocks to compare.
...
...
platforms/opencl/src/kernels/nonbonded_cpu.cl
View file @
dd352ee5
...
...
@@ -13,6 +13,7 @@ typedef struct {
__kernel
void
computeNonbonded
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__global
float4*
posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__local
AtomData*
localData,
__local
float4*
tempBuffer,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
#
else
...
...
@@ -21,11 +22,11 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
PARAMETER_ARGUMENTS
)
{
#
ifdef
USE_CUTOFF
unsigned
int
numTiles
=
interactionCount[0]
;
unsigned
int
pos
=
(
numTiles
>
maxTiles
?
START_TILE_INDEX
+get_group_id
(
0
)
*
(
END_TILE_INDEX-START_TILE_INDEX
)
/get_num_groups
(
0
)
:
get_group_id
(
0
)
*numTiles/get_num_groups
(
0
))
;
unsigned
int
end
=
(
numTiles
>
maxTiles
?
START_TILE_INDEX
+
(
get_group_id
(
0
)
+1
)
*
(
END_TILE_INDEX-START_TILE_INDEX
)
/get_num_groups
(
0
)
:
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
))
;
unsigned
int
pos
=
(
numTiles
>
maxTiles
?
startTileIndex
+get_group_id
(
0
)
*
(
endTileIndex-startTileIndex
)
/get_num_groups
(
0
)
:
get_group_id
(
0
)
*numTiles/get_num_groups
(
0
))
;
unsigned
int
end
=
(
numTiles
>
maxTiles
?
startTileIndex
+
(
get_group_id
(
0
)
+1
)
*
(
endTileIndex-startTileIndex
)
/get_num_groups
(
0
)
:
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
))
;
#
else
unsigned
int
pos
=
START_TILE_INDEX
+get_group_id
(
0
)
*numTiles/get_num_groups
(
0
)
;
unsigned
int
end
=
START_TILE_INDEX
+
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
unsigned
int
pos
=
startTileIndex
+get_group_id
(
0
)
*numTiles/get_num_groups
(
0
)
;
unsigned
int
end
=
startTileIndex
+
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
#
endif
float
energy
=
0.0f
;
unsigned
int
lasty
=
0xFFFFFFFF
;
...
...
platforms/opencl/src/kernels/nonbonded_default.cl
View file @
dd352ee5
...
...
@@ -14,6 +14,7 @@ typedef struct {
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
void
computeNonbonded
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__global
float4*
posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__local
AtomData*
localData,
__local
float4*
tempBuffer,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
#
else
...
...
@@ -22,11 +23,11 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
PARAMETER_ARGUMENTS
)
{
#
ifdef
USE_CUTOFF
unsigned
int
numTiles
=
interactionCount[0]
;
unsigned
int
pos
=
(
numTiles
>
maxTiles
?
START_TILE_INDEX
+get_group_id
(
0
)
*
(
END_TILE_INDEX-START_TILE_INDEX
)
/get_num_groups
(
0
)
:
get_group_id
(
0
)
*numTiles/get_num_groups
(
0
))
;
unsigned
int
end
=
(
numTiles
>
maxTiles
?
START_TILE_INDEX
+
(
get_group_id
(
0
)
+1
)
*
(
END_TILE_INDEX-START_TILE_INDEX
)
/get_num_groups
(
0
)
:
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
))
;
unsigned
int
pos
=
(
numTiles
>
maxTiles
?
startTileIndex
+get_group_id
(
0
)
*
(
endTileIndex-startTileIndex
)
/get_num_groups
(
0
)
:
get_group_id
(
0
)
*numTiles/get_num_groups
(
0
))
;
unsigned
int
end
=
(
numTiles
>
maxTiles
?
startTileIndex
+
(
get_group_id
(
0
)
+1
)
*
(
endTileIndex-startTileIndex
)
/get_num_groups
(
0
)
:
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
))
;
#
else
unsigned
int
pos
=
START_TILE_INDEX
+get_group_id
(
0
)
*numTiles/get_num_groups
(
0
)
;
unsigned
int
end
=
START_TILE_INDEX
+
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
unsigned
int
pos
=
startTileIndex
+get_group_id
(
0
)
*numTiles/get_num_groups
(
0
)
;
unsigned
int
end
=
startTileIndex
+
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
#
endif
float
energy
=
0.0f
;
unsigned
int
lasty
=
0xFFFFFFFF
;
...
...
platforms/opencl/src/kernels/nonbonded_nvidia.cl
View file @
dd352ee5
...
...
@@ -14,6 +14,7 @@ typedef struct {
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
void
computeNonbonded
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__global
float4*
posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__local
AtomData*
localData,
__local
float*
tempBuffer,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
#
else
...
...
@@ -24,11 +25,11 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
unsigned
int
warp
=
get_global_id
(
0
)
/TILE_SIZE
;
#
ifdef
USE_CUTOFF
unsigned
int
numTiles
=
interactionCount[0]
;
unsigned
int
pos
=
(
numTiles
>
maxTiles
?
START_TILE_INDEX+warp*
(
END_TILE_INDEX-START_TILE_INDEX
)
/totalWarps
:
warp*numTiles/totalWarps
)
;
unsigned
int
end
=
(
numTiles
>
maxTiles
?
START_TILE_INDEX+
(
warp+1
)
*
(
END_TILE_INDEX-START_TILE_INDEX
)
/totalWarps
:
(
warp+1
)
*numTiles/totalWarps
)
;
unsigned
int
pos
=
(
numTiles
>
maxTiles
?
startTileIndex+warp*
(
endTileIndex-startTileIndex
)
/totalWarps
:
warp*numTiles/totalWarps
)
;
unsigned
int
end
=
(
numTiles
>
maxTiles
?
startTileIndex+
(
warp+1
)
*
(
endTileIndex-startTileIndex
)
/totalWarps
:
(
warp+1
)
*numTiles/totalWarps
)
;
#
else
unsigned
int
pos
=
START_TILE_INDEX
+warp*numTiles/totalWarps
;
unsigned
int
end
=
START_TILE_INDEX
+
(
warp+1
)
*numTiles/totalWarps
;
unsigned
int
pos
=
startTileIndex
+warp*numTiles/totalWarps
;
unsigned
int
end
=
startTileIndex
+
(
warp+1
)
*numTiles/totalWarps
;
#
endif
float
energy
=
0.0f
;
unsigned
int
lasty
=
0xFFFFFFFF
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment