Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
dd352ee5
Commit
dd352ee5
authored
Apr 26, 2011
by
Peter Eastman
Browse files
Added dynamic load balancing between GPUs
parent
6e3526b4
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
106 additions
and
35 deletions
+106
-35
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+22
-9
platforms/opencl/src/OpenCLNonbondedUtilities.h
platforms/opencl/src/OpenCLNonbondedUtilities.h
+4
-0
platforms/opencl/src/OpenCLParallelKernels.cpp
platforms/opencl/src/OpenCLParallelKernels.cpp
+54
-7
platforms/opencl/src/OpenCLParallelKernels.h
platforms/opencl/src/OpenCLParallelKernels.h
+2
-0
platforms/opencl/src/kernels/findInteractingBlocks.cl
platforms/opencl/src/kernels/findInteractingBlocks.cl
+4
-3
platforms/opencl/src/kernels/findInteractingBlocks_cpu.cl
platforms/opencl/src/kernels/findInteractingBlocks_cpu.cl
+5
-4
platforms/opencl/src/kernels/nonbonded_cpu.cl
platforms/opencl/src/kernels/nonbonded_cpu.cl
+5
-4
platforms/opencl/src/kernels/nonbonded_default.cl
platforms/opencl/src/kernels/nonbonded_default.cl
+5
-4
platforms/opencl/src/kernels/nonbonded_nvidia.cl
platforms/opencl/src/kernels/nonbonded_nvidia.cl
+5
-4
No files found.
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
View file @
dd352ee5
...
@@ -233,8 +233,6 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
...
@@ -233,8 +233,6 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
if
(
useCutoff
)
{
if
(
useCutoff
)
{
map
<
string
,
string
>
defines
;
map
<
string
,
string
>
defines
;
defines
[
"NUM_BLOCKS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getNumAtomBlocks
());
defines
[
"NUM_BLOCKS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getNumAtomBlocks
());
defines
[
"START_TILE_INDEX"
]
=
OpenCLExpressionUtilities
::
intToString
(
startTileIndex
);
defines
[
"END_TILE_INDEX"
]
=
OpenCLExpressionUtilities
::
intToString
(
startTileIndex
+
numTiles
);
if
(
forceBufferPerAtomBlock
)
if
(
forceBufferPerAtomBlock
)
defines
[
"USE_OUTPUT_BUFFER_PER_BLOCK"
]
=
"1"
;
defines
[
"USE_OUTPUT_BUFFER_PER_BLOCK"
]
=
"1"
;
if
(
usePeriodic
)
if
(
usePeriodic
)
...
@@ -256,6 +254,8 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
...
@@ -256,6 +254,8 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
interactionFlags
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
interactionFlags
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
8
,
context
.
getPosq
().
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
8
,
context
.
getPosq
().
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl_uint
>
(
9
,
interactingTiles
->
getSize
());
findInteractingBlocksKernel
.
setArg
<
cl_uint
>
(
9
,
interactingTiles
->
getSize
());
findInteractingBlocksKernel
.
setArg
<
cl_uint
>
(
10
,
startTileIndex
);
findInteractingBlocksKernel
.
setArg
<
cl_uint
>
(
11
,
startTileIndex
+
numTiles
);
if
(
context
.
getSIMDWidth
()
==
32
&&
!
deviceIsCpu
)
{
if
(
context
.
getSIMDWidth
()
==
32
&&
!
deviceIsCpu
)
{
findInteractionsWithinBlocksKernel
=
cl
::
Kernel
(
interactingBlocksProgram
,
"findInteractionsWithinBlocks"
);
findInteractionsWithinBlocksKernel
=
cl
::
Kernel
(
interactingBlocksProgram
,
"findInteractionsWithinBlocks"
);
findInteractionsWithinBlocksKernel
.
setArg
<
cl_float
>
(
0
,
(
cl_float
)
(
cutoff
*
cutoff
));
findInteractionsWithinBlocksKernel
.
setArg
<
cl_float
>
(
0
,
(
cl_float
)
(
cutoff
*
cutoff
));
...
@@ -302,8 +302,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
...
@@ -302,8 +302,8 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
void
OpenCLNonbondedUtilities
::
computeInteractions
()
{
void
OpenCLNonbondedUtilities
::
computeInteractions
()
{
if
(
cutoff
!=
-
1.0
)
{
if
(
cutoff
!=
-
1.0
)
{
if
(
useCutoff
)
{
if
(
useCutoff
)
{
forceKernel
.
setArg
<
mm_float4
>
(
1
0
,
context
.
getPeriodicBoxSize
());
forceKernel
.
setArg
<
mm_float4
>
(
1
2
,
context
.
getPeriodicBoxSize
());
forceKernel
.
setArg
<
mm_float4
>
(
1
1
,
context
.
getInvPeriodicBoxSize
());
forceKernel
.
setArg
<
mm_float4
>
(
1
3
,
context
.
getInvPeriodicBoxSize
());
}
}
context
.
executeKernel
(
forceKernel
,
(
context
.
getNumAtomBlocks
()
*
(
context
.
getNumAtomBlocks
()
+
1
)
/
2
)
*
OpenCLContext
::
TileSize
,
deviceIsCpu
?
1
:
-
1
);
context
.
executeKernel
(
forceKernel
,
(
context
.
getNumAtomBlocks
()
*
(
context
.
getNumAtomBlocks
()
+
1
)
/
2
)
*
OpenCLContext
::
TileSize
,
deviceIsCpu
?
1
:
-
1
);
}
}
...
@@ -325,14 +325,14 @@ void OpenCLNonbondedUtilities::updateNeighborListSize() {
...
@@ -325,14 +325,14 @@ void OpenCLNonbondedUtilities::updateNeighborListSize() {
newSize
=
numTiles
;
newSize
=
numTiles
;
delete
interactingTiles
;
delete
interactingTiles
;
interactingTiles
=
new
OpenCLArray
<
mm_ushort2
>
(
context
,
newSize
,
"interactingTiles"
);
interactingTiles
=
new
OpenCLArray
<
mm_ushort2
>
(
context
,
newSize
,
"interactingTiles"
);
forceKernel
.
setArg
<
cl
::
Buffer
>
(
8
,
interactingTiles
->
getDeviceBuffer
());
forceKernel
.
setArg
<
cl
::
Buffer
>
(
10
,
interactingTiles
->
getDeviceBuffer
());
forceKernel
.
setArg
<
cl_uint
>
(
1
2
,
newSize
);
forceKernel
.
setArg
<
cl_uint
>
(
1
4
,
newSize
);
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
interactingTiles
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
interactingTiles
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl_uint
>
(
9
,
newSize
);
findInteractingBlocksKernel
.
setArg
<
cl_uint
>
(
9
,
newSize
);
if
(
context
.
getSIMDWidth
()
==
32
||
deviceIsCpu
)
{
if
(
context
.
getSIMDWidth
()
==
32
||
deviceIsCpu
)
{
delete
interactionFlags
;
delete
interactionFlags
;
interactionFlags
=
new
OpenCLArray
<
cl_uint
>
(
context
,
deviceIsCpu
?
2
*
newSize
:
newSize
,
"interactionFlags"
);
interactionFlags
=
new
OpenCLArray
<
cl_uint
>
(
context
,
deviceIsCpu
?
2
*
newSize
:
newSize
,
"interactionFlags"
);
forceKernel
.
setArg
<
cl
::
Buffer
>
(
1
3
,
interactionFlags
->
getDeviceBuffer
());
forceKernel
.
setArg
<
cl
::
Buffer
>
(
1
5
,
interactionFlags
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
interactionFlags
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
interactionFlags
->
getDeviceBuffer
());
findInteractionsWithinBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
interactingTiles
->
getDeviceBuffer
());
findInteractionsWithinBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
interactingTiles
->
getDeviceBuffer
());
findInteractionsWithinBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
interactionFlags
->
getDeviceBuffer
());
findInteractionsWithinBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
interactionFlags
->
getDeviceBuffer
());
...
@@ -340,6 +340,19 @@ void OpenCLNonbondedUtilities::updateNeighborListSize() {
...
@@ -340,6 +340,19 @@ void OpenCLNonbondedUtilities::updateNeighborListSize() {
}
}
}
}
/**
 * Store the tile range assigned to this context and forward the new bounds
 * to the kernels that take them as arguments.
 */
void OpenCLNonbondedUtilities::setTileRange(int startTileIndex, int numTiles) {
    this->startTileIndex = startTileIndex;
    this->numTiles = numTiles;
    if (cutoff != -1.0) {
        // A cutoff of -1.0 means there are no nonbonded interactions in the System,
        // in which case there are no kernel arguments to update.
        const int endTileIndex = startTileIndex+numTiles;
        forceKernel.setArg<cl_uint>(8, startTileIndex);
        forceKernel.setArg<cl_uint>(9, endTileIndex);
        if (useCutoff) {
            // The neighbor list kernel takes the same bounds as its last two arguments.
            findInteractingBlocksKernel.setArg<cl_uint>(10, startTileIndex);
            findInteractingBlocksKernel.setArg<cl_uint>(11, endTileIndex);
        }
    }
}
cl
::
Kernel
OpenCLNonbondedUtilities
::
createInteractionKernel
(
const
string
&
source
,
const
vector
<
ParameterInfo
>&
params
,
const
vector
<
ParameterInfo
>&
arguments
,
bool
useExclusions
,
bool
isSymmetric
)
const
{
cl
::
Kernel
OpenCLNonbondedUtilities
::
createInteractionKernel
(
const
string
&
source
,
const
vector
<
ParameterInfo
>&
params
,
const
vector
<
ParameterInfo
>&
arguments
,
bool
useExclusions
,
bool
isSymmetric
)
const
{
map
<
string
,
string
>
replacements
;
map
<
string
,
string
>
replacements
;
replacements
[
"COMPUTE_INTERACTION"
]
=
source
;
replacements
[
"COMPUTE_INTERACTION"
]
=
source
;
...
@@ -447,8 +460,6 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
...
@@ -447,8 +460,6 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
defines
[
"NUM_ATOMS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getNumAtoms
());
defines
[
"NUM_ATOMS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getPaddedNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getPaddedNumAtoms
());
defines
[
"NUM_BLOCKS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getNumAtomBlocks
());
defines
[
"NUM_BLOCKS"
]
=
OpenCLExpressionUtilities
::
intToString
(
context
.
getNumAtomBlocks
());
defines
[
"START_TILE_INDEX"
]
=
OpenCLExpressionUtilities
::
intToString
(
startTileIndex
);
defines
[
"END_TILE_INDEX"
]
=
OpenCLExpressionUtilities
::
intToString
(
startTileIndex
+
numTiles
);
string
file
;
string
file
;
if
(
deviceIsCpu
)
if
(
deviceIsCpu
)
file
=
OpenCLKernelSources
::
nonbonded_cpu
;
file
=
OpenCLKernelSources
::
nonbonded_cpu
;
...
@@ -470,6 +481,8 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
...
@@ -470,6 +481,8 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
exclusionRowIndices
->
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
exclusionRowIndices
->
getDeviceBuffer
());
kernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
*
localDataSize
:
OpenCLContext
::
ThreadBlockSize
*
localDataSize
),
NULL
);
kernel
.
setArg
(
index
++
,
(
deviceIsCpu
?
OpenCLContext
::
TileSize
*
localDataSize
:
OpenCLContext
::
ThreadBlockSize
*
localDataSize
),
NULL
);
kernel
.
setArg
(
index
++
,
4
*
OpenCLContext
::
ThreadBlockSize
*
sizeof
(
cl_float
),
NULL
);
kernel
.
setArg
(
index
++
,
4
*
OpenCLContext
::
ThreadBlockSize
*
sizeof
(
cl_float
),
NULL
);
kernel
.
setArg
<
cl_uint
>
(
index
++
,
startTileIndex
);
kernel
.
setArg
<
cl_uint
>
(
index
++
,
startTileIndex
+
numTiles
);
if
(
useCutoff
)
{
if
(
useCutoff
)
{
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
interactingTiles
->
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
interactingTiles
->
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
interactionCount
->
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
interactionCount
->
getDeviceBuffer
());
...
...
platforms/opencl/src/OpenCLNonbondedUtilities.h
View file @
dd352ee5
...
@@ -190,6 +190,10 @@ public:
...
@@ -190,6 +190,10 @@ public:
int
getNumTiles
()
const
{
int
getNumTiles
()
const
{
return
numTiles
;
return
numTiles
;
}
}
/**
 * Set the range of tiles that should be processed by this context.
 *
 * @param startTileIndex   the index of the first tile this context should process
 * @param numTiles         the number of tiles this context should process, so the range
 *                         ends just before tile startTileIndex+numTiles
 */
void
setTileRange
(
int
startTileIndex
,
int
numTiles
);
/**
/**
* Create a Kernel for evaluating a nonbonded interaction. Cutoffs and periodic boundary conditions
* Create a Kernel for evaluating a nonbonded interaction. Cutoffs and periodic boundary conditions
* are assumed to be the same as those for the default interaction Kernel, since this kernel will use
* are assumed to be the same as those for the default interaction Kernel, since this kernel will use
...
...
platforms/opencl/src/OpenCLParallelKernels.cpp
View file @
dd352ee5
...
@@ -29,6 +29,28 @@
...
@@ -29,6 +29,28 @@
using
namespace
OpenMM
;
using
namespace
OpenMM
;
using
namespace
std
;
using
namespace
std
;
/**
 * Get the current clock time, measured in microseconds.
 *
 * The result is intended for comparing two readings taken with this same function.
 * NOTE(review): the return type is long, which is 32 bits wide on some platforms
 * (including MSVC), so the microsecond count may be truncated there - confirm that
 * callers only rely on differences between nearby readings.
 *
 * @return the current time in microseconds
 */
#ifdef _MSC_VER
    #include <Windows.h>
    static long getTime() {
        FILETIME ft;
        GetSystemTimeAsFileTime(&ft); // 100-nanoseconds since 1-1-1601
        ULARGE_INTEGER result;
        result.LowPart = ft.dwLowDateTime;
        result.HighPart = ft.dwHighDateTime;
        // ULARGE_INTEGER is a union, so the combined 64 bit value has to be read through
        // QuadPart.  Dividing the 100 ns tick count by 10 converts it to microseconds.
        return (long) (result.QuadPart/10);
    }
#else
    #include <sys/time.h>
    static long getTime() {
        struct timeval tod;
        gettimeofday(&tod, 0);
        // Combine the seconds and microseconds fields into a single microsecond count.
        return 1000000*tod.tv_sec+tod.tv_usec;
    }
#endif
class
OpenCLParallelCalcForcesAndEnergyKernel
::
BeginComputationTask
:
public
OpenCLContext
::
WorkTask
{
class
OpenCLParallelCalcForcesAndEnergyKernel
::
BeginComputationTask
:
public
OpenCLContext
::
WorkTask
{
public:
public:
BeginComputationTask
(
ContextImpl
&
context
,
OpenCLContext
&
cl
,
OpenCLCalcForcesAndEnergyKernel
&
kernel
,
BeginComputationTask
(
ContextImpl
&
context
,
OpenCLContext
&
cl
,
OpenCLCalcForcesAndEnergyKernel
&
kernel
,
...
@@ -52,8 +74,8 @@ private:
...
@@ -52,8 +74,8 @@ private:
class
OpenCLParallelCalcForcesAndEnergyKernel
::
FinishComputationTask
:
public
OpenCLContext
::
WorkTask
{
class
OpenCLParallelCalcForcesAndEnergyKernel
::
FinishComputationTask
:
public
OpenCLContext
::
WorkTask
{
public:
public:
FinishComputationTask
(
ContextImpl
&
context
,
OpenCLContext
&
cl
,
OpenCLCalcForcesAndEnergyKernel
&
kernel
,
FinishComputationTask
(
ContextImpl
&
context
,
OpenCLContext
&
cl
,
OpenCLCalcForcesAndEnergyKernel
&
kernel
,
bool
includeForce
,
bool
includeEnergy
,
double
&
energy
)
:
context
(
context
),
cl
(
cl
),
kernel
(
kernel
),
bool
includeForce
,
bool
includeEnergy
,
double
&
energy
,
long
&
completionTime
)
:
context
(
context
),
cl
(
cl
),
kernel
(
kernel
),
includeForce
(
includeForce
),
includeEnergy
(
includeEnergy
),
energy
(
energy
)
{
includeForce
(
includeForce
),
includeEnergy
(
includeEnergy
),
energy
(
energy
)
,
completionTime
(
completionTime
)
{
}
}
void
execute
()
{
void
execute
()
{
// Execute the kernel, then download forces.
// Execute the kernel, then download forces.
...
@@ -61,7 +83,7 @@ public:
...
@@ -61,7 +83,7 @@ public:
energy
+=
kernel
.
finishComputation
(
context
,
includeForce
,
includeEnergy
);
energy
+=
kernel
.
finishComputation
(
context
,
includeForce
,
includeEnergy
);
if
(
includeForce
)
if
(
includeForce
)
cl
.
getForce
().
download
();
cl
.
getForce
().
download
();
mm_float4
f
=
cl
.
getForce
()[
0
]
;
completionTime
=
getTime
()
;
}
}
private:
private:
ContextImpl
&
context
;
ContextImpl
&
context
;
...
@@ -69,10 +91,11 @@ private:
...
@@ -69,10 +91,11 @@ private:
OpenCLCalcForcesAndEnergyKernel
&
kernel
;
OpenCLCalcForcesAndEnergyKernel
&
kernel
;
bool
includeForce
,
includeEnergy
;
bool
includeForce
,
includeEnergy
;
double
&
energy
;
double
&
energy
;
long
&
completionTime
;
};
};
OpenCLParallelCalcForcesAndEnergyKernel
::
OpenCLParallelCalcForcesAndEnergyKernel
(
string
name
,
const
Platform
&
platform
,
OpenCLPlatform
::
PlatformData
&
data
)
:
OpenCLParallelCalcForcesAndEnergyKernel
::
OpenCLParallelCalcForcesAndEnergyKernel
(
string
name
,
const
Platform
&
platform
,
OpenCLPlatform
::
PlatformData
&
data
)
:
CalcForcesAndEnergyKernel
(
name
,
platform
),
data
(
data
)
{
CalcForcesAndEnergyKernel
(
name
,
platform
),
data
(
data
)
,
completionTimes
(
data
.
contexts
.
size
()),
contextTiles
(
data
.
contexts
.
size
())
{
for
(
int
i
=
0
;
i
<
(
int
)
data
.
contexts
.
size
();
i
++
)
for
(
int
i
=
0
;
i
<
(
int
)
data
.
contexts
.
size
();
i
++
)
kernels
.
push_back
(
Kernel
(
new
OpenCLCalcForcesAndEnergyKernel
(
name
,
platform
,
*
data
.
contexts
[
i
])));
kernels
.
push_back
(
Kernel
(
new
OpenCLCalcForcesAndEnergyKernel
(
name
,
platform
,
*
data
.
contexts
[
i
])));
}
}
...
@@ -98,7 +121,7 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c
...
@@ -98,7 +121,7 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c
for
(
int
i
=
0
;
i
<
(
int
)
data
.
contexts
.
size
();
i
++
)
{
for
(
int
i
=
0
;
i
<
(
int
)
data
.
contexts
.
size
();
i
++
)
{
OpenCLContext
&
cl
=
*
data
.
contexts
[
i
];
OpenCLContext
&
cl
=
*
data
.
contexts
[
i
];
OpenCLContext
::
WorkThread
&
thread
=
cl
.
getWorkThread
();
OpenCLContext
::
WorkThread
&
thread
=
cl
.
getWorkThread
();
thread
.
addTask
(
new
FinishComputationTask
(
context
,
cl
,
getKernel
(
i
),
includeForce
,
includeEnergy
,
data
.
contextEnergy
[
i
]));
thread
.
addTask
(
new
FinishComputationTask
(
context
,
cl
,
getKernel
(
i
),
includeForce
,
includeEnergy
,
data
.
contextEnergy
[
i
]
,
completionTimes
[
i
]
));
}
}
data
.
syncContexts
();
data
.
syncContexts
();
double
energy
=
0.0
;
double
energy
=
0.0
;
...
@@ -107,8 +130,6 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c
...
@@ -107,8 +130,6 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c
if
(
includeForce
)
{
if
(
includeForce
)
{
// Sum the forces from all devices.
// Sum the forces from all devices.
for
(
int
i
=
0
;
i
<
(
int
)
data
.
contexts
.
size
();
i
++
)
data
.
contexts
[
i
]
->
getForce
().
download
();
OpenCLArray
<
mm_float4
>&
forces
=
data
.
contexts
[
0
]
->
getForce
();
OpenCLArray
<
mm_float4
>&
forces
=
data
.
contexts
[
0
]
->
getForce
();
for
(
int
i
=
1
;
i
<
(
int
)
data
.
contexts
.
size
();
i
++
)
{
for
(
int
i
=
1
;
i
<
(
int
)
data
.
contexts
.
size
();
i
++
)
{
OpenCLArray
<
mm_float4
>&
contextForces
=
data
.
contexts
[
i
]
->
getForce
();
OpenCLArray
<
mm_float4
>&
contextForces
=
data
.
contexts
[
i
]
->
getForce
();
...
@@ -121,6 +142,32 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c
...
@@ -121,6 +142,32 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c
}
}
}
}
forces
.
upload
();
forces
.
upload
();
// Balance work between the contexts by transferring a few nonbonded tiles from the context that
// finished last to the one that finished first.
int
firstIndex
=
0
,
lastIndex
=
0
;
int
totalTiles
=
0
;
for
(
int
i
=
0
;
i
<
(
int
)
completionTimes
.
size
();
i
++
)
{
if
(
completionTimes
[
i
]
<
completionTimes
[
firstIndex
])
firstIndex
=
i
;
if
(
completionTimes
[
i
]
>
completionTimes
[
lastIndex
])
lastIndex
=
i
;
contextTiles
[
i
]
=
data
.
contexts
[
i
]
->
getNonbondedUtilities
().
getNumTiles
();
totalTiles
+=
contextTiles
[
i
];
}
int
tilesToTransfer
=
totalTiles
/
1000
;
if
(
tilesToTransfer
<
1
)
tilesToTransfer
=
1
;
if
(
tilesToTransfer
>
contextTiles
[
lastIndex
])
tilesToTransfer
=
contextTiles
[
lastIndex
];
contextTiles
[
firstIndex
]
+=
tilesToTransfer
;
contextTiles
[
lastIndex
]
-=
tilesToTransfer
;
int
startIndex
=
0
;
for
(
int
i
=
0
;
i
<
(
int
)
contextTiles
.
size
();
i
++
)
{
data
.
contexts
[
i
]
->
getNonbondedUtilities
().
setTileRange
(
startIndex
,
contextTiles
[
i
]);
startIndex
+=
contextTiles
[
i
];
}
}
}
return
energy
;
return
energy
;
}
}
...
...
platforms/opencl/src/OpenCLParallelKernels.h
View file @
dd352ee5
...
@@ -76,6 +76,8 @@ private:
...
@@ -76,6 +76,8 @@ private:
class
FinishComputationTask
;
class
FinishComputationTask
;
OpenCLPlatform
::
PlatformData
&
data
;
OpenCLPlatform
::
PlatformData
&
data
;
std
::
vector
<
Kernel
>
kernels
;
std
::
vector
<
Kernel
>
kernels
;
std
::
vector
<
long
>
completionTimes
;
std
::
vector
<
int
>
contextTiles
;
};
};
/**
/**
...
...
platforms/opencl/src/kernels/findInteractingBlocks.cl
View file @
dd352ee5
...
@@ -159,7 +159,8 @@ void storeInteractionData(__local ushort2* buffer, __local int* valid, __local s
...
@@ -159,7 +159,8 @@ void storeInteractionData(__local ushort2* buffer, __local int* valid, __local s
*/
*/
__kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global float4* blockCenter,
__kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global float4* blockCenter,
__global float4* blockBoundingBox, __global unsigned int* interactionCount, __global ushort2* interactingTiles,
__global float4* blockBoundingBox, __global unsigned int* interactionCount, __global ushort2* interactingTiles,
__global unsigned int* interactionFlags, __global float4* posq, unsigned int maxTiles) {
__global unsigned int* interactionFlags, __global float4* posq, unsigned int maxTiles, unsigned int startTileIndex,
unsigned int endTileIndex) {
__local ushort2 buffer[BUFFER_SIZE];
__local ushort2 buffer[BUFFER_SIZE];
__local int valid[BUFFER_SIZE];
__local int valid[BUFFER_SIZE];
__local short sum[BUFFER_SIZE];
__local short sum[BUFFER_SIZE];
...
@@ -172,11 +173,11 @@ __kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBox
...
@@ -172,11 +173,11 @@ __kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBox
for (int i = 0; i < BUFFER_GROUPS; ++i)
for (int i = 0; i < BUFFER_GROUPS; ++i)
valid[i*GROUP_SIZE+get_local_id(0)] = false;
valid[i*GROUP_SIZE+get_local_id(0)] = false;
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
for (int baseIndex =
START_TILE_INDEX
+get_group_id(0)*get_local_size(0); baseIndex <
END_TILE_INDEX
; baseIndex += get_global_size(0)) {
for (int baseIndex =
startTileIndex
+get_group_id(0)*get_local_size(0); baseIndex <
endTileIndex
; baseIndex += get_global_size(0)) {
// Identify the pair of blocks to compare.
// Identify the pair of blocks to compare.
int index = baseIndex+get_local_id(0);
int index = baseIndex+get_local_id(0);
if (index <
END_TILE_INDEX
) {
if (index <
endTileIndex
) {
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*index));
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*index));
unsigned int x = (index-y*NUM_BLOCKS+y*(y+1)/2);
unsigned int x = (index-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y |
|
x
>=
NUM_BLOCKS
)
{
//
Occasionally
happens
due
to
roundoff
error.
if (x < y |
|
x
>=
NUM_BLOCKS
)
{
//
Occasionally
happens
due
to
roundoff
error.
...
...
platforms/opencl/src/kernels/findInteractingBlocks_cpu.cl
View file @
dd352ee5
...
@@ -123,12 +123,13 @@ void storeInteractionData(ushort2* buffer, int numValid, __global unsigned int*
...
@@ -123,12 +123,13 @@ void storeInteractionData(ushort2* buffer, int numValid, __global unsigned int*
*/
*/
__kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global float4* blockCenter,
__kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global float4* blockCenter,
__global float4* blockBoundingBox, __global unsigned int* interactionCount, __global ushort2* interactingTiles,
__global float4* blockBoundingBox, __global unsigned int* interactionCount, __global ushort2* interactingTiles,
__global unsigned int* interactionFlags, __global float4* posq, unsigned int maxTiles) {
__global unsigned int* interactionFlags, __global float4* posq, unsigned int maxTiles, unsigned int startTileIndex,
unsigned int endTileIndex) {
ushort2 buffer[BUFFER_SIZE];
ushort2 buffer[BUFFER_SIZE];
int valuesInBuffer = 0;
int valuesInBuffer = 0;
const int numTiles =
END_TILE_INDEX-START_TILE_INDEX
;
const int numTiles =
endTileIndex-startTileIndex
;
unsigned int start =
START_TILE_INDEX
+get_group_id(0)*numTiles/get_num_groups(0);
unsigned int start =
startTileIndex
+get_group_id(0)*numTiles/get_num_groups(0);
unsigned int end =
START_TILE_INDEX
+(get_group_id(0)+1)*numTiles/get_num_groups(0);
unsigned int end =
startTileIndex
+(get_group_id(0)+1)*numTiles/get_num_groups(0);
for (int index = start; index < end; index++) {
for (int index = start; index < end; index++) {
// Identify the pair of blocks to compare.
// Identify the pair of blocks to compare.
...
...
platforms/opencl/src/kernels/nonbonded_cpu.cl
View file @
dd352ee5
...
@@ -13,6 +13,7 @@ typedef struct {
...
@@ -13,6 +13,7 @@ typedef struct {
__kernel
void
computeNonbonded
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__global
float4*
posq,
__global
unsigned
int*
exclusions,
__kernel
void
computeNonbonded
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__global
float4*
posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__local
AtomData*
localData,
__local
float4*
tempBuffer,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__local
AtomData*
localData,
__local
float4*
tempBuffer,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
#
else
#
else
...
@@ -21,11 +22,11 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
...
@@ -21,11 +22,11 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
PARAMETER_ARGUMENTS
)
{
PARAMETER_ARGUMENTS
)
{
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
unsigned
int
numTiles
=
interactionCount[0]
;
unsigned
int
numTiles
=
interactionCount[0]
;
unsigned
int
pos
=
(
numTiles
>
maxTiles
?
START_TILE_INDEX
+get_group_id
(
0
)
*
(
END_TILE_INDEX-START_TILE_INDEX
)
/get_num_groups
(
0
)
:
get_group_id
(
0
)
*numTiles/get_num_groups
(
0
))
;
unsigned
int
pos
=
(
numTiles
>
maxTiles
?
startTileIndex
+get_group_id
(
0
)
*
(
endTileIndex-startTileIndex
)
/get_num_groups
(
0
)
:
get_group_id
(
0
)
*numTiles/get_num_groups
(
0
))
;
unsigned
int
end
=
(
numTiles
>
maxTiles
?
START_TILE_INDEX
+
(
get_group_id
(
0
)
+1
)
*
(
END_TILE_INDEX-START_TILE_INDEX
)
/get_num_groups
(
0
)
:
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
))
;
unsigned
int
end
=
(
numTiles
>
maxTiles
?
startTileIndex
+
(
get_group_id
(
0
)
+1
)
*
(
endTileIndex-startTileIndex
)
/get_num_groups
(
0
)
:
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
))
;
#
else
#
else
unsigned
int
pos
=
START_TILE_INDEX
+get_group_id
(
0
)
*numTiles/get_num_groups
(
0
)
;
unsigned
int
pos
=
startTileIndex
+get_group_id
(
0
)
*numTiles/get_num_groups
(
0
)
;
unsigned
int
end
=
START_TILE_INDEX
+
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
unsigned
int
end
=
startTileIndex
+
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
#
endif
#
endif
float
energy
=
0.0f
;
float
energy
=
0.0f
;
unsigned
int
lasty
=
0xFFFFFFFF
;
unsigned
int
lasty
=
0xFFFFFFFF
;
...
...
platforms/opencl/src/kernels/nonbonded_default.cl
View file @
dd352ee5
...
@@ -14,6 +14,7 @@ typedef struct {
...
@@ -14,6 +14,7 @@ typedef struct {
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
void
computeNonbonded
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__global
float4*
posq,
__global
unsigned
int*
exclusions,
void
computeNonbonded
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__global
float4*
posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__local
AtomData*
localData,
__local
float4*
tempBuffer,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__local
AtomData*
localData,
__local
float4*
tempBuffer,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
#
else
#
else
...
@@ -22,11 +23,11 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
...
@@ -22,11 +23,11 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
PARAMETER_ARGUMENTS
)
{
PARAMETER_ARGUMENTS
)
{
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
unsigned
int
numTiles
=
interactionCount[0]
;
unsigned
int
numTiles
=
interactionCount[0]
;
unsigned
int
pos
=
(
numTiles
>
maxTiles
?
START_TILE_INDEX
+get_group_id
(
0
)
*
(
END_TILE_INDEX-START_TILE_INDEX
)
/get_num_groups
(
0
)
:
get_group_id
(
0
)
*numTiles/get_num_groups
(
0
))
;
unsigned
int
pos
=
(
numTiles
>
maxTiles
?
startTileIndex
+get_group_id
(
0
)
*
(
endTileIndex-startTileIndex
)
/get_num_groups
(
0
)
:
get_group_id
(
0
)
*numTiles/get_num_groups
(
0
))
;
unsigned
int
end
=
(
numTiles
>
maxTiles
?
START_TILE_INDEX
+
(
get_group_id
(
0
)
+1
)
*
(
END_TILE_INDEX-START_TILE_INDEX
)
/get_num_groups
(
0
)
:
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
))
;
unsigned
int
end
=
(
numTiles
>
maxTiles
?
startTileIndex
+
(
get_group_id
(
0
)
+1
)
*
(
endTileIndex-startTileIndex
)
/get_num_groups
(
0
)
:
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
))
;
#
else
#
else
unsigned
int
pos
=
START_TILE_INDEX
+get_group_id
(
0
)
*numTiles/get_num_groups
(
0
)
;
unsigned
int
pos
=
startTileIndex
+get_group_id
(
0
)
*numTiles/get_num_groups
(
0
)
;
unsigned
int
end
=
START_TILE_INDEX
+
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
unsigned
int
end
=
startTileIndex
+
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
#
endif
#
endif
float
energy
=
0.0f
;
float
energy
=
0.0f
;
unsigned
int
lasty
=
0xFFFFFFFF
;
unsigned
int
lasty
=
0xFFFFFFFF
;
...
...
platforms/opencl/src/kernels/nonbonded_nvidia.cl
View file @
dd352ee5
...
@@ -14,6 +14,7 @@ typedef struct {
...
@@ -14,6 +14,7 @@ typedef struct {
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
void
computeNonbonded
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__global
float4*
posq,
__global
unsigned
int*
exclusions,
void
computeNonbonded
(
__global
float4*
forceBuffers,
__global
float*
energyBuffer,
__global
float4*
posq,
__global
unsigned
int*
exclusions,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__local
AtomData*
localData,
__local
float*
tempBuffer,
__global
unsigned
int*
exclusionIndices,
__global
unsigned
int*
exclusionRowIndices,
__local
AtomData*
localData,
__local
float*
tempBuffer,
unsigned
int
startTileIndex,
unsigned
int
endTileIndex,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
__global
ushort2*
tiles,
__global
unsigned
int*
interactionCount,
float4
periodicBoxSize,
float4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
unsigned
int*
interactionFlags
#
else
#
else
...
@@ -24,11 +25,11 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
...
@@ -24,11 +25,11 @@ void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffe
unsigned
int
warp
=
get_global_id
(
0
)
/TILE_SIZE
;
unsigned
int
warp
=
get_global_id
(
0
)
/TILE_SIZE
;
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
unsigned
int
numTiles
=
interactionCount[0]
;
unsigned
int
numTiles
=
interactionCount[0]
;
unsigned
int
pos
=
(
numTiles
>
maxTiles
?
START_TILE_INDEX+warp*
(
END_TILE_INDEX-START_TILE_INDEX
)
/totalWarps
:
warp*numTiles/totalWarps
)
;
unsigned
int
pos
=
(
numTiles
>
maxTiles
?
startTileIndex+warp*
(
endTileIndex-startTileIndex
)
/totalWarps
:
warp*numTiles/totalWarps
)
;
unsigned
int
end
=
(
numTiles
>
maxTiles
?
START_TILE_INDEX+
(
warp+1
)
*
(
END_TILE_INDEX-START_TILE_INDEX
)
/totalWarps
:
(
warp+1
)
*numTiles/totalWarps
)
;
unsigned
int
end
=
(
numTiles
>
maxTiles
?
startTileIndex+
(
warp+1
)
*
(
endTileIndex-startTileIndex
)
/totalWarps
:
(
warp+1
)
*numTiles/totalWarps
)
;
#
else
#
else
unsigned
int
pos
=
START_TILE_INDEX
+warp*numTiles/totalWarps
;
unsigned
int
pos
=
startTileIndex
+warp*numTiles/totalWarps
;
unsigned
int
end
=
START_TILE_INDEX
+
(
warp+1
)
*numTiles/totalWarps
;
unsigned
int
end
=
startTileIndex
+
(
warp+1
)
*numTiles/totalWarps
;
#
endif
#
endif
float
energy
=
0.0f
;
float
energy
=
0.0f
;
unsigned
int
lasty
=
0xFFFFFFFF
;
unsigned
int
lasty
=
0xFFFFFFFF
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment