Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
3b6925ae
Commit
3b6925ae
authored
Jan 26, 2017
by
Andy Simmonett
Committed by
GitHub
Jan 26, 2017
Browse files
Merge pull request #1 from peastman/ljpme
Cleanup to LJ PME code
parents
5a8a8aa9
f7a102fb
Changes
193
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
487 additions
and
150 deletions
+487
-150
platforms/cuda/src/CudaNonbondedUtilities.cpp
platforms/cuda/src/CudaNonbondedUtilities.cpp
+48
-23
platforms/cuda/src/CudaParallelKernels.cpp
platforms/cuda/src/CudaParallelKernels.cpp
+19
-14
platforms/cuda/src/CudaPlatform.cpp
platforms/cuda/src/CudaPlatform.cpp
+4
-0
platforms/cuda/src/CudaSort.cpp
platforms/cuda/src/CudaSort.cpp
+2
-2
platforms/cuda/src/kernels/coulombLennardJones.cu
platforms/cuda/src/kernels/coulombLennardJones.cu
+45
-1
platforms/cuda/src/kernels/customGBEnergyN2.cu
platforms/cuda/src/kernels/customGBEnergyN2.cu
+10
-6
platforms/cuda/src/kernels/customGBValueN2.cu
platforms/cuda/src/kernels/customGBValueN2.cu
+1
-1
platforms/cuda/src/kernels/customIntegratorPerDof.cu
platforms/cuda/src/kernels/customIntegratorPerDof.cu
+1
-1
platforms/cuda/src/kernels/customNonbonded.cu
platforms/cuda/src/kernels/customNonbonded.cu
+4
-2
platforms/cuda/src/kernels/findInteractingBlocks.cu
platforms/cuda/src/kernels/findInteractingBlocks.cu
+122
-27
platforms/cuda/src/kernels/gbsaObc1.cu
platforms/cuda/src/kernels/gbsaObc1.cu
+11
-7
platforms/cuda/src/kernels/langevin.cu
platforms/cuda/src/kernels/langevin.cu
+4
-4
platforms/cuda/src/kernels/nonbonded.cu
platforms/cuda/src/kernels/nonbonded.cu
+60
-6
platforms/cuda/src/kernels/pme.cu
platforms/cuda/src/kernels/pme.cu
+66
-8
platforms/cuda/src/kernels/sort.cu
platforms/cuda/src/kernels/sort.cu
+11
-15
platforms/opencl/include/OpenCLKernels.h
platforms/opencl/include/OpenCLKernels.h
+13
-3
platforms/opencl/include/OpenCLParallelKernels.h
platforms/opencl/include/OpenCLParallelKernels.h
+10
-1
platforms/opencl/src/OpenCLKernels.cpp
platforms/opencl/src/OpenCLKernels.cpp
+50
-29
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+2
-0
platforms/opencl/src/OpenCLParallelKernels.cpp
platforms/opencl/src/OpenCLParallelKernels.cpp
+4
-0
No files found.
platforms/cuda/src/CudaNonbondedUtilities.cpp
View file @
3b6925ae
...
...
@@ -64,15 +64,16 @@ private:
CudaNonbondedUtilities
::
CudaNonbondedUtilities
(
CudaContext
&
context
)
:
context
(
context
),
useCutoff
(
false
),
usePeriodic
(
false
),
anyExclusions
(
false
),
usePadding
(
true
),
exclusionIndices
(
NULL
),
exclusionRowIndices
(
NULL
),
exclusionTiles
(
NULL
),
exclusions
(
NULL
),
interactingTiles
(
NULL
),
interactingAtoms
(
NULL
),
interactionCount
(
NULL
),
blockCenter
(
NULL
),
blockBoundingBox
(
NULL
),
sortedBlocks
(
NULL
),
sortedBlockCenter
(
NULL
),
sortedBlockBoundingBox
(
NULL
),
oldPositions
(
NULL
),
rebuildNeighborList
(
NULL
),
blockSorter
(
NULL
),
pinnedCountBuffer
(
NULL
),
forceRebuildNeighborList
(
true
),
lastCutoff
(
0.0
),
groupFlags
(
0
)
{
interactionCount
(
NULL
),
singlePairs
(
NULL
),
blockCenter
(
NULL
),
blockBoundingBox
(
NULL
),
sortedBlocks
(
NULL
),
sortedBlockCenter
(
NULL
),
sortedBlockBoundingBox
(
NULL
),
oldPositions
(
NULL
),
rebuildNeighborList
(
NULL
),
blockSorter
(
NULL
),
pinnedCountBuffer
(
NULL
),
forceRebuildNeighborList
(
true
),
lastCutoff
(
0.0
),
groupFlags
(
0
),
canUsePairList
(
true
)
{
// Decide how many thread blocks to use.
string
errorMessage
=
"Error initializing nonbonded utilities"
;
int
multiprocessors
;
CHECK_RESULT
(
cuDeviceGetAttribute
(
&
multiprocessors
,
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
,
context
.
getDevice
()));
CHECK_RESULT
(
cuEventCreate
(
&
downloadCountEvent
,
0
));
CHECK_RESULT
(
cuMemHostAlloc
((
void
**
)
&
pinnedCountBuffer
,
sizeof
(
int
),
CU_MEMHOSTALLOC_PORTABLE
));
CHECK_RESULT
(
cuMemHostAlloc
((
void
**
)
&
pinnedCountBuffer
,
2
*
sizeof
(
int
),
CU_MEMHOSTALLOC_PORTABLE
));
numForceThreadBlocks
=
4
*
multiprocessors
;
forceThreadBlockSize
=
(
context
.
getComputeCapability
()
<
2.0
?
128
:
256
);
}
...
...
@@ -92,6 +93,8 @@ CudaNonbondedUtilities::~CudaNonbondedUtilities() {
delete
interactingAtoms
;
if
(
interactionCount
!=
NULL
)
delete
interactionCount
;
if
(
singlePairs
!=
NULL
)
delete
singlePairs
;
if
(
blockCenter
!=
NULL
)
delete
blockCenter
;
if
(
blockBoundingBox
!=
NULL
)
...
...
@@ -113,7 +116,7 @@ CudaNonbondedUtilities::~CudaNonbondedUtilities() {
cuEventDestroy
(
downloadCountEvent
);
}
void
CudaNonbondedUtilities
::
addInteraction
(
bool
usesCutoff
,
bool
usesPeriodic
,
bool
usesExclusions
,
double
cutoffDistance
,
const
vector
<
vector
<
int
>
>&
exclusionList
,
const
string
&
kernel
,
int
forceGroup
)
{
void
CudaNonbondedUtilities
::
addInteraction
(
bool
usesCutoff
,
bool
usesPeriodic
,
bool
usesExclusions
,
double
cutoffDistance
,
const
vector
<
vector
<
int
>
>&
exclusionList
,
const
string
&
kernel
,
int
forceGroup
,
bool
supportsPairList
)
{
if
(
groupCutoff
.
size
()
>
0
)
{
if
(
usesCutoff
!=
useCutoff
)
throw
OpenMMException
(
"All Forces must agree on whether to use a cutoff"
);
...
...
@@ -128,6 +131,7 @@ void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic,
usePeriodic
=
usesPeriodic
;
groupCutoff
[
forceGroup
]
=
cutoffDistance
;
groupFlags
|=
1
<<
forceGroup
;
canUsePairList
&=
supportsPairList
;
if
(
kernel
.
size
()
>
0
)
{
if
(
groupKernelSource
.
find
(
forceGroup
)
==
groupKernelSource
.
end
())
groupKernelSource
[
forceGroup
]
=
""
;
...
...
@@ -279,9 +283,11 @@ void CudaNonbondedUtilities::initialize(const System& system) {
maxTiles
=
numTiles
;
if
(
maxTiles
<
1
)
maxTiles
=
1
;
maxSinglePairs
=
5
*
numAtoms
;
interactingTiles
=
CudaArray
::
create
<
int
>
(
context
,
maxTiles
,
"interactingTiles"
);
interactingAtoms
=
CudaArray
::
create
<
int
>
(
context
,
CudaContext
::
TileSize
*
maxTiles
,
"interactingAtoms"
);
interactionCount
=
CudaArray
::
create
<
unsigned
int
>
(
context
,
1
,
"interactionCount"
);
interactionCount
=
CudaArray
::
create
<
unsigned
int
>
(
context
,
2
,
"interactionCount"
);
singlePairs
=
CudaArray
::
create
<
int2
>
(
context
,
maxSinglePairs
,
"singlePairs"
);
int
elementSize
=
(
context
.
getUseDoublePrecision
()
?
sizeof
(
double
)
:
sizeof
(
float
));
blockCenter
=
new
CudaArray
(
context
,
numAtomBlocks
,
4
*
elementSize
,
"blockCenter"
);
blockBoundingBox
=
new
CudaArray
(
context
,
numAtomBlocks
,
4
*
elementSize
,
"blockBoundingBox"
);
...
...
@@ -291,7 +297,7 @@ void CudaNonbondedUtilities::initialize(const System& system) {
oldPositions
=
new
CudaArray
(
context
,
numAtoms
,
4
*
elementSize
,
"oldPositions"
);
rebuildNeighborList
=
CudaArray
::
create
<
int
>
(
context
,
1
,
"rebuildNeighborList"
);
blockSorter
=
new
CudaSort
(
context
,
new
BlockSortTrait
(
context
.
getUseDoublePrecision
()),
numAtomBlocks
);
vector
<
unsigned
int
>
count
(
1
,
0
);
vector
<
unsigned
int
>
count
(
2
,
0
);
interactionCount
->
upload
(
count
);
}
...
...
@@ -316,6 +322,8 @@ void CudaNonbondedUtilities::initialize(const System& system) {
forceArgs
.
push_back
(
&
blockCenter
->
getDevicePointer
());
forceArgs
.
push_back
(
&
blockBoundingBox
->
getDevicePointer
());
forceArgs
.
push_back
(
&
interactingAtoms
->
getDevicePointer
());
forceArgs
.
push_back
(
&
maxSinglePairs
);
forceArgs
.
push_back
(
&
singlePairs
->
getDevicePointer
());
}
for
(
int
i
=
0
;
i
<
(
int
)
parameters
.
size
();
i
++
)
forceArgs
.
push_back
(
&
parameters
[
i
].
getMemory
());
...
...
@@ -353,8 +361,10 @@ void CudaNonbondedUtilities::initialize(const System& system) {
findInteractingBlocksArgs
.
push_back
(
&
interactionCount
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
interactingTiles
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
interactingAtoms
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
singlePairs
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
context
.
getPosq
().
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
maxTiles
);
findInteractingBlocksArgs
.
push_back
(
&
maxSinglePairs
);
findInteractingBlocksArgs
.
push_back
(
&
startBlockIndex
);
findInteractingBlocksArgs
.
push_back
(
&
numBlocks
);
findInteractingBlocksArgs
.
push_back
(
&
sortedBlocks
->
getDevicePointer
());
...
...
@@ -424,28 +434,39 @@ void CudaNonbondedUtilities::computeInteractions(int forceGroups, bool includeFo
bool
CudaNonbondedUtilities
::
updateNeighborListSize
()
{
if
(
!
useCutoff
)
return
false
;
if
(
pinnedCountBuffer
[
0
]
<=
(
unsigned
int
)
maxTile
s
)
if
(
pinnedCountBuffer
[
0
]
<=
maxTiles
&&
pinnedCountBuffer
[
1
]
<=
maxSinglePair
s
)
return
false
;
// The most recent timestep had too many interactions to fit in the arrays. Make the arrays bigger to prevent
// this from happening in the future.
maxTiles
=
(
int
)
(
1.2
*
pinnedCountBuffer
[
0
]);
int
totalTiles
=
context
.
getNumAtomBlocks
()
*
(
context
.
getNumAtomBlocks
()
+
1
)
/
2
;
if
(
maxTiles
>
totalTiles
)
maxTiles
=
totalTiles
;
delete
interactingTiles
;
delete
interactingAtoms
;
interactingTiles
=
NULL
;
// Avoid an error in the destructor if the following allocation fails
interactingAtoms
=
NULL
;
interactingTiles
=
CudaArray
::
create
<
int
>
(
context
,
maxTiles
,
"interactingTiles"
);
interactingAtoms
=
CudaArray
::
create
<
int
>
(
context
,
CudaContext
::
TileSize
*
maxTiles
,
"interactingAtoms"
);
if
(
forceArgs
.
size
()
>
0
)
forceArgs
[
7
]
=
&
interactingTiles
->
getDevicePointer
();
findInteractingBlocksArgs
[
6
]
=
&
interactingTiles
->
getDevicePointer
();
if
(
forceArgs
.
size
()
>
0
)
forceArgs
[
17
]
=
&
interactingAtoms
->
getDevicePointer
();
findInteractingBlocksArgs
[
7
]
=
&
interactingAtoms
->
getDevicePointer
();
if
(
pinnedCountBuffer
[
0
]
>
maxTiles
)
{
maxTiles
=
(
int
)
(
1.2
*
pinnedCountBuffer
[
0
]);
int
totalTiles
=
context
.
getNumAtomBlocks
()
*
(
context
.
getNumAtomBlocks
()
+
1
)
/
2
;
if
(
maxTiles
>
totalTiles
)
maxTiles
=
totalTiles
;
delete
interactingTiles
;
delete
interactingAtoms
;
interactingTiles
=
NULL
;
// Avoid an error in the destructor if the following allocation fails
interactingAtoms
=
NULL
;
interactingTiles
=
CudaArray
::
create
<
int
>
(
context
,
maxTiles
,
"interactingTiles"
);
interactingAtoms
=
CudaArray
::
create
<
int
>
(
context
,
CudaContext
::
TileSize
*
maxTiles
,
"interactingAtoms"
);
if
(
forceArgs
.
size
()
>
0
)
forceArgs
[
7
]
=
&
interactingTiles
->
getDevicePointer
();
findInteractingBlocksArgs
[
6
]
=
&
interactingTiles
->
getDevicePointer
();
if
(
forceArgs
.
size
()
>
0
)
forceArgs
[
17
]
=
&
interactingAtoms
->
getDevicePointer
();
findInteractingBlocksArgs
[
7
]
=
&
interactingAtoms
->
getDevicePointer
();
}
if
(
pinnedCountBuffer
[
1
]
>
maxSinglePairs
)
{
maxSinglePairs
=
(
int
)
(
1.2
*
pinnedCountBuffer
[
1
]);
delete
singlePairs
;
singlePairs
=
NULL
;
// Avoid an error in the destructor if the following allocation fails
singlePairs
=
CudaArray
::
create
<
int2
>
(
context
,
maxSinglePairs
,
"singlePairs"
);
if
(
forceArgs
.
size
()
>
0
)
forceArgs
[
19
]
=
&
singlePairs
->
getDevicePointer
();
findInteractingBlocksArgs
[
8
]
=
&
singlePairs
->
getDevicePointer
();
}
forceRebuildNeighborList
=
true
;
context
.
setForcesValid
(
false
);
return
true
;
...
...
@@ -492,7 +513,11 @@ void CudaNonbondedUtilities::createKernelsForGroups(int groups) {
defines
[
"NUM_TILES_WITH_EXCLUSIONS"
]
=
context
.
intToString
(
exclusionTiles
->
getSize
());
if
(
usePeriodic
)
defines
[
"USE_PERIODIC"
]
=
"1"
;
if
(
context
.
getBoxIsTriclinic
())
defines
[
"TRICLINIC"
]
=
"1"
;
defines
[
"MAX_EXCLUSIONS"
]
=
context
.
intToString
(
maxExclusions
);
// Temporarily disable the pair list until we figure out why it's failing on some GPUs.
defines
[
"MAX_BITS_FOR_PAIRS"
]
=
"0"
;
//(canUsePairList ? "2" : "0");
CUmodule
interactingBlocksProgram
=
context
.
createModule
(
CudaKernelSources
::
vectorOps
+
CudaKernelSources
::
findInteractingBlocks
,
defines
);
kernels
.
findBlockBoundsKernel
=
context
.
getKernel
(
interactingBlocksProgram
,
"findBlockBounds"
);
kernels
.
sortBoxDataKernel
=
context
.
getKernel
(
interactingBlocksProgram
,
"sortBoxData"
);
...
...
platforms/cuda/src/CudaParallelKernels.cpp
View file @
3b6925ae
...
...
@@ -63,8 +63,8 @@ if (result != CUDA_SUCCESS) { \
class
CudaParallelCalcForcesAndEnergyKernel
::
BeginComputationTask
:
public
CudaContext
::
WorkTask
{
public:
BeginComputationTask
(
ContextImpl
&
context
,
CudaContext
&
cu
,
CudaCalcForcesAndEnergyKernel
&
kernel
,
bool
includeForce
,
bool
includeEnergy
,
int
groups
,
void
*
pinnedMemory
,
CUevent
event
,
int
&
numTiles
)
:
context
(
context
),
cu
(
cu
),
kernel
(
kernel
),
includeForce
(
includeForce
),
includeEnergy
(
includeEnergy
),
groups
(
groups
),
pinnedMemory
(
pinnedMemory
),
event
(
event
),
numTiles
(
numTiles
)
{
bool
includeForce
,
bool
includeEnergy
,
int
groups
,
void
*
pinnedMemory
,
CUevent
event
,
int
2
&
interactionCount
)
:
context
(
context
),
cu
(
cu
),
kernel
(
kernel
),
includeForce
(
includeForce
),
includeEnergy
(
includeEnergy
),
groups
(
groups
),
pinnedMemory
(
pinnedMemory
),
event
(
event
),
interactionCount
(
interactionCount
)
{
}
void
execute
()
{
// Copy coordinates over to this device and execute the kernel.
...
...
@@ -77,7 +77,7 @@ public:
}
kernel
.
beginComputation
(
context
,
includeForce
,
includeEnergy
,
groups
);
if
(
cu
.
getNonbondedUtilities
().
getUsePeriodic
())
cu
.
getNonbondedUtilities
().
getInteractionCount
().
download
(
&
numTiles
,
false
);
cu
.
getNonbondedUtilities
().
getInteractionCount
().
download
(
&
interactionCount
,
false
);
}
private:
ContextImpl
&
context
;
...
...
@@ -87,15 +87,15 @@ private:
int
groups
;
void
*
pinnedMemory
;
CUevent
event
;
int
&
numTiles
;
int
2
&
interactionCount
;
};
class
CudaParallelCalcForcesAndEnergyKernel
::
FinishComputationTask
:
public
CudaContext
::
WorkTask
{
public:
FinishComputationTask
(
ContextImpl
&
context
,
CudaContext
&
cu
,
CudaCalcForcesAndEnergyKernel
&
kernel
,
bool
includeForce
,
bool
includeEnergy
,
int
groups
,
double
&
energy
,
long
long
&
completionTime
,
long
long
*
pinnedMemory
,
CudaArray
&
contextForces
,
bool
&
valid
,
int
&
numTiles
)
:
bool
includeForce
,
bool
includeEnergy
,
int
groups
,
double
&
energy
,
long
long
&
completionTime
,
long
long
*
pinnedMemory
,
CudaArray
&
contextForces
,
bool
&
valid
,
int
2
&
interactionCount
)
:
context
(
context
),
cu
(
cu
),
kernel
(
kernel
),
includeForce
(
includeForce
),
includeEnergy
(
includeEnergy
),
groups
(
groups
),
energy
(
energy
),
completionTime
(
completionTime
),
pinnedMemory
(
pinnedMemory
),
contextForces
(
contextForces
),
valid
(
valid
),
numTiles
(
numTiles
)
{
completionTime
(
completionTime
),
pinnedMemory
(
pinnedMemory
),
contextForces
(
contextForces
),
valid
(
valid
),
interactionCount
(
interactionCount
)
{
}
void
execute
()
{
// Execute the kernel, then download forces.
...
...
@@ -120,7 +120,8 @@ public:
cu
.
getForce
().
download
(
&
pinnedMemory
[(
cu
.
getContextIndex
()
-
1
)
*
numAtoms
*
3
]);
}
}
if
(
cu
.
getNonbondedUtilities
().
getUsePeriodic
()
&&
numTiles
>
cu
.
getNonbondedUtilities
().
getInteractingTiles
().
getSize
())
{
if
(
cu
.
getNonbondedUtilities
().
getUsePeriodic
()
&&
(
interactionCount
.
x
>
cu
.
getNonbondedUtilities
().
getInteractingTiles
().
getSize
()
||
interactionCount
.
y
>
cu
.
getNonbondedUtilities
().
getSinglePairs
().
getSize
()))
{
valid
=
false
;
cu
.
getNonbondedUtilities
().
updateNeighborListSize
();
}
...
...
@@ -136,12 +137,12 @@ private:
long
long
*
pinnedMemory
;
CudaArray
&
contextForces
;
bool
&
valid
;
int
&
numTiles
;
int
2
&
interactionCount
;
};
CudaParallelCalcForcesAndEnergyKernel
::
CudaParallelCalcForcesAndEnergyKernel
(
string
name
,
const
Platform
&
platform
,
CudaPlatform
::
PlatformData
&
data
)
:
CalcForcesAndEnergyKernel
(
name
,
platform
),
data
(
data
),
completionTimes
(
data
.
contexts
.
size
()),
contextNonbondedFractions
(
data
.
contexts
.
size
()),
tile
Counts
(
NULL
),
contextForces
(
NULL
),
pinnedPositionBuffer
(
NULL
),
pinnedForceBuffer
(
NULL
)
{
interaction
Counts
(
NULL
),
contextForces
(
NULL
),
pinnedPositionBuffer
(
NULL
),
pinnedForceBuffer
(
NULL
)
{
for
(
int
i
=
0
;
i
<
(
int
)
data
.
contexts
.
size
();
i
++
)
kernels
.
push_back
(
Kernel
(
new
CudaCalcForcesAndEnergyKernel
(
name
,
platform
,
*
data
.
contexts
[
i
])));
}
...
...
@@ -156,8 +157,8 @@ CudaParallelCalcForcesAndEnergyKernel::~CudaParallelCalcForcesAndEnergyKernel()
cuMemFreeHost
(
pinnedForceBuffer
);
cuEventDestroy
(
event
);
cuStreamDestroy
(
peerCopyStream
);
if
(
tile
Counts
!=
NULL
)
cuMemFreeHost
(
tile
Counts
);
if
(
interaction
Counts
!=
NULL
)
cuMemFreeHost
(
interaction
Counts
);
}
void
CudaParallelCalcForcesAndEnergyKernel
::
initialize
(
const
System
&
system
)
{
...
...
@@ -172,7 +173,7 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
contextNonbondedFractions
[
i
]
=
1
/
(
double
)
numContexts
;
CHECK_RESULT
(
cuEventCreate
(
&
event
,
0
),
"Error creating event"
);
CHECK_RESULT
(
cuStreamCreate
(
&
peerCopyStream
,
CU_STREAM_NON_BLOCKING
),
"Error creating stream"
);
CHECK_RESULT
(
cuMemHostAlloc
((
void
**
)
&
tile
Counts
,
numContexts
*
sizeof
(
int
),
0
),
"Error creating
tile
count buffer"
);
CHECK_RESULT
(
cuMemHostAlloc
((
void
**
)
&
interaction
Counts
,
numContexts
*
sizeof
(
int
2
),
0
),
"Error creating
interaction
count
s
buffer"
);
}
void
CudaParallelCalcForcesAndEnergyKernel
::
beginComputation
(
ContextImpl
&
context
,
bool
includeForce
,
bool
includeEnergy
,
int
groups
)
{
...
...
@@ -202,7 +203,7 @@ void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& contex
data
.
contextEnergy
[
i
]
=
0.0
;
CudaContext
&
cu
=
*
data
.
contexts
[
i
];
CudaContext
::
WorkThread
&
thread
=
cu
.
getWorkThread
();
thread
.
addTask
(
new
BeginComputationTask
(
context
,
cu
,
getKernel
(
i
),
includeForce
,
includeEnergy
,
groups
,
pinnedPositionBuffer
,
event
,
tile
Counts
[
i
]));
thread
.
addTask
(
new
BeginComputationTask
(
context
,
cu
,
getKernel
(
i
),
includeForce
,
includeEnergy
,
groups
,
pinnedPositionBuffer
,
event
,
interaction
Counts
[
i
]));
}
}
...
...
@@ -210,7 +211,7 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
for
(
int
i
=
0
;
i
<
(
int
)
data
.
contexts
.
size
();
i
++
)
{
CudaContext
&
cu
=
*
data
.
contexts
[
i
];
CudaContext
::
WorkThread
&
thread
=
cu
.
getWorkThread
();
thread
.
addTask
(
new
FinishComputationTask
(
context
,
cu
,
getKernel
(
i
),
includeForce
,
includeEnergy
,
groups
,
data
.
contextEnergy
[
i
],
completionTimes
[
i
],
pinnedForceBuffer
,
*
contextForces
,
valid
,
tile
Counts
[
i
]));
thread
.
addTask
(
new
FinishComputationTask
(
context
,
cu
,
getKernel
(
i
),
includeForce
,
includeEnergy
,
groups
,
data
.
contextEnergy
[
i
],
completionTimes
[
i
],
pinnedForceBuffer
,
*
contextForces
,
valid
,
interaction
Counts
[
i
]));
}
data
.
syncContexts
();
double
energy
=
0.0
;
...
...
@@ -627,6 +628,10 @@ void CudaParallelCalcNonbondedForceKernel::getPMEParameters(double& alpha, int&
dynamic_cast
<
const
CudaCalcNonbondedForceKernel
&>
(
kernels
[
0
].
getImpl
()).
getPMEParameters
(
alpha
,
nx
,
ny
,
nz
);
}
void
CudaParallelCalcNonbondedForceKernel
::
getLJPMEParameters
(
double
&
alpha
,
int
&
nx
,
int
&
ny
,
int
&
nz
)
const
{
dynamic_cast
<
const
CudaCalcNonbondedForceKernel
&>
(
kernels
[
0
].
getImpl
()).
getLJPMEParameters
(
alpha
,
nx
,
ny
,
nz
);
}
class
CudaParallelCalcCustomNonbondedForceKernel
::
Task
:
public
CudaContext
::
WorkTask
{
public:
Task
(
ContextImpl
&
context
,
CudaCalcCustomNonbondedForceKernel
&
kernel
,
bool
includeForce
,
...
...
platforms/cuda/src/CudaPlatform.cpp
View file @
3b6925ae
...
...
@@ -247,6 +247,10 @@ CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& sys
CHECK_RESULT
(
cuDeviceGetName
(
name
,
1000
,
contexts
[
i
]
->
getDevice
()),
"Error querying device name"
);
deviceName
<<
name
;
}
size_t
printfsize
;
cuCtxGetLimit
(
&
printfsize
,
CU_LIMIT_PRINTF_FIFO_SIZE
);
cuCtxSetLimit
(
CU_LIMIT_PRINTF_FIFO_SIZE
,
10
*
printfsize
);
useCpuPme
=
(
cpuPmeProperty
==
"true"
&&
!
contexts
[
0
]
->
getUseDoublePrecision
());
disablePmeStream
=
(
pmeStreamProperty
==
"true"
);
deterministicForces
=
(
deterministicForcesProperty
==
"true"
);
...
...
platforms/cuda/src/CudaSort.cpp
View file @
3b6925ae
...
...
@@ -114,13 +114,13 @@ void CudaSort::sort(CudaArray& data) {
unsigned
int
numBuckets
=
bucketOffset
->
getSize
();
void
*
rangeArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
dataLength
,
&
dataRange
->
getDevicePointer
(),
&
numBuckets
,
&
bucketOffset
->
getDevicePointer
()};
context
.
executeKernel
(
computeRangeKernel
,
rangeArgs
,
rangeKernelSize
,
rangeKernelSize
,
rangeKernelSize
*
trait
->
getKeySize
());
context
.
executeKernel
(
computeRangeKernel
,
rangeArgs
,
rangeKernelSize
,
rangeKernelSize
,
2
*
rangeKernelSize
*
trait
->
getKeySize
());
// Assign array elements to buckets.
void
*
elementsArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
dataLength
,
&
numBuckets
,
&
dataRange
->
getDevicePointer
(),
&
bucketOffset
->
getDevicePointer
(),
&
bucketOfElement
->
getDevicePointer
(),
&
offsetInBucket
->
getDevicePointer
()};
context
.
executeKernel
(
assignElementsKernel
,
elementsArgs
,
data
.
getSize
());
context
.
executeKernel
(
assignElementsKernel
,
elementsArgs
,
data
.
getSize
()
,
128
);
// Compute the position of each bucket.
...
...
platforms/cuda/src/kernels/coulombLennardJones.cu
View file @
3b6925ae
...
...
@@ -17,6 +17,26 @@
const
real
erfcAlphaR
=
(
0.254829592
f
+
(
-
0.284496736
f
+
(
1.421413741
f
+
(
-
1.453152027
f
+
1.061405429
f
*
t
)
*
t
)
*
t
)
*
t
)
*
t
*
expAlphaRSqr
;
#endif
real
tempForce
=
0.0
f
;
#if HAS_LENNARD_JONES
// The multiplicative term to correct for the multiplicative terms that are always
// present in reciprocal space. The real terms have an additive contribution
// added in, but for excluded terms the multiplicative term is just subtracted.
// These factors are needed in both clauses of the needCorrection statement, so
// I declare them up here.
#if DO_LJPME
const
real
dispersionAlphaR
=
EWALD_DISPERSION_ALPHA
*
r
;
const
real
dar2
=
dispersionAlphaR
*
dispersionAlphaR
;
const
real
dar4
=
dar2
*
dar2
;
const
real
dar6
=
dar4
*
dar2
;
const
real
invR2
=
invR
*
invR
;
const
real
expDar2
=
EXP
(
-
dar2
);
const
real2
sigExpProd
=
sigmaEpsilon1
*
sigmaEpsilon2
;
const
real
c6
=
64
*
sigExpProd
.
x
*
sigExpProd
.
x
*
sigExpProd
.
x
*
sigExpProd
.
y
;
const
real
coef
=
invR2
*
invR2
*
invR2
*
c6
;
const
real
eprefac
=
1.0
f
+
dar2
+
0.5
f
*
dar4
;
const
real
dprefac
=
eprefac
+
dar6
/
6.0
f
;
#endif
#endif
if
(
needCorrection
)
{
// Subtract off the part of this interaction that was included in the reciprocal space contribution.
...
...
@@ -29,6 +49,13 @@
includeInteraction
=
false
;
tempEnergy
-=
TWO_OVER_SQRT_PI
*
EWALD_ALPHA
*
138.935456
f
*
posq1
.
w
*
posq2
.
w
;
}
#if HAS_LENNARD_JONES
#if DO_LJPME
// The multiplicative grid term
tempEnergy
+=
coef
*
(
1.0
f
-
expDar2
*
eprefac
);
tempForce
+=
6.0
f
*
coef
*
(
1.0
f
-
expDar2
*
dprefac
);
#endif
#endif
}
else
{
#if HAS_LENNARD_JONES
...
...
@@ -36,7 +63,8 @@
real
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
real
sig6
=
sig2
*
sig2
*
sig2
;
real
epssig6
=
sig6
*
(
sigmaEpsilon1
.
y
*
sigmaEpsilon2
.
y
);
real
eps
=
sigmaEpsilon1
.
y
*
sigmaEpsilon2
.
y
;
real
epssig6
=
sig6
*
eps
;
tempForce
=
epssig6
*
(
12.0
f
*
sig6
-
6.0
f
);
real
ljEnergy
=
epssig6
*
(
sig6
-
1.0
f
);
#if USE_LJ_SWITCH
...
...
@@ -48,6 +76,22 @@
ljEnergy
*=
switchValue
;
}
#endif
#if DO_LJPME
// The multiplicative grid term
ljEnergy
+=
coef
*
(
1.0
f
-
expDar2
*
eprefac
);
tempForce
+=
6.0
f
*
coef
*
(
1.0
f
-
expDar2
*
dprefac
);
// The potential shift accounts for the step at the cutoff introduced by the
// transition from additive to multiplicative combintion rules and is only
// needed for the real (not excluded) terms. By addin these terms to ljEnergy
// instead of tempEnergy here, the includeInteraction mask is correctly applied.
sig2
=
sig
*
sig
;
sig6
=
sig2
*
sig2
*
sig2
*
INVCUT6
;
epssig6
=
eps
*
sig6
;
// The additive part of the potential shift
ljEnergy
+=
epssig6
*
(
1.0
f
-
sig6
);
// The multiplicative part of the potential shift
ljEnergy
+=
MULTSHIFT6
*
c6
;
#endif
tempForce
+=
prefactor
*
(
erfcAlphaR
+
alphaR
*
expAlphaRSqr
*
TWO_OVER_SQRT_PI
);
tempEnergy
+=
includeInteraction
?
ljEnergy
+
prefactor
*
erfcAlphaR
:
0
;
#else
...
...
platforms/cuda/src/kernels/customGBEnergyN2.cu
View file @
3b6925ae
...
...
@@ -14,7 +14,7 @@ typedef struct {
* Compute a force based on pair interactions.
*/
extern
"C"
__global__
void
computeN2Energy
(
unsigned
long
long
*
__restrict__
forceBuffers
,
mixed
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
unsigned
int
*
__restrict__
exclusions
,
const
ushort2
*
__restrict__
exclusionTiles
,
const
real4
*
__restrict__
posq
,
const
unsigned
int
*
__restrict__
exclusions
,
const
ushort2
*
__restrict__
exclusionTiles
,
bool
needEnergy
,
#ifdef USE_CUTOFF
const
int
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
...
...
@@ -78,7 +78,8 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
COMPUTE_INTERACTION
dEdR
/=
-
r
;
}
energy
+=
0.5
f
*
tempEnergy
;
if
(
needEnergy
)
energy
+=
0.5
f
*
tempEnergy
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
...
...
@@ -130,7 +131,8 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
COMPUTE_INTERACTION
dEdR
/=
-
r
;
}
energy
+=
tempEnergy
;
if
(
needEnergy
)
energy
+=
tempEnergy
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
...
...
@@ -234,7 +236,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
LOAD_ATOM1_PARAMETERS
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
#ifdef USE_CUTOFF
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
]
:
y
*
TILE_SIZE
+
tgx
)
;
unsigned
int
j
=
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
];
#else
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
...
...
@@ -274,7 +276,8 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
COMPUTE_INTERACTION
dEdR
/=
-
r
;
}
energy
+=
tempEnergy
;
if
(
needEnergy
)
energy
+=
tempEnergy
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
...
...
@@ -318,7 +321,8 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
COMPUTE_INTERACTION
dEdR
/=
-
r
;
}
energy
+=
tempEnergy
;
if
(
needEnergy
)
energy
+=
tempEnergy
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
...
...
platforms/cuda/src/kernels/customGBValueN2.cu
View file @
3b6925ae
...
...
@@ -212,7 +212,7 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
LOAD_ATOM1_PARAMETERS
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
#ifdef USE_CUTOFF
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
]
:
y
*
TILE_SIZE
+
tgx
)
;
unsigned
int
j
=
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
];
#else
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
...
...
platforms/cuda/src/kernels/customIntegratorPerDof.cu
View file @
3b6925ae
...
...
@@ -34,7 +34,7 @@ inline __device__ mixed4 convertFromDouble4(double4 a) {
extern
"C"
__global__
void
computePerDof
(
real4
*
__restrict__
posq
,
real4
*
__restrict__
posqCorrection
,
mixed4
*
__restrict__
posDelta
,
mixed4
*
__restrict__
velm
,
const
long
long
*
__restrict__
force
,
const
mixed2
*
__restrict__
dt
,
const
mixed
*
__restrict__
globals
,
mixed
*
__restrict__
sum
,
const
float4
*
__restrict__
gaussianValues
,
unsigned
int
gaussianBaseIndex
,
const
float4
*
__restrict__
uniformValues
,
const
real
energy
,
mixed
*
__restrict__
energyParamDerivs
const
mixed
energy
,
mixed
*
__restrict__
energyParamDerivs
PARAMETER_ARGUMENTS
)
{
mixed
stepSize
=
dt
[
0
].
y
;
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
...
...
platforms/cuda/src/kernels/customNonbonded.cu
View file @
3b6925ae
...
...
@@ -14,8 +14,10 @@ if (!isExcluded) {
#endif
COMPUTE_FORCE
#if USE_SWITCH
tempForce
=
tempForce
*
switchValue
-
tempEnergy
*
switchDeriv
;
tempEnergy
*=
switchValue
;
tempForce
=
tempForce
*
switchValue
-
customEnergy
*
switchDeriv
;
tempEnergy
+=
customEnergy
*
switchValue
;
#else
tempEnergy
+=
customEnergy
;
#endif
dEdR
+=
tempForce
*
invR
;
}
platforms/cuda/src/kernels/findInteractingBlocks.cu
View file @
3b6925ae
...
...
@@ -27,8 +27,19 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize,
maxPos
=
make_real4
(
max
(
maxPos
.
x
,
pos
.
x
),
max
(
maxPos
.
y
,
pos
.
y
),
max
(
maxPos
.
z
,
pos
.
z
),
0
);
}
real4
blockSize
=
0.5
f
*
(
maxPos
-
minPos
);
real4
center
=
0.5
f
*
(
maxPos
+
minPos
);
center
.
w
=
0
;
for
(
int
i
=
base
;
i
<
last
;
i
++
)
{
pos
=
posq
[
i
];
real4
delta
=
posq
[
i
]
-
center
;
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
center
.
w
=
max
(
center
.
w
,
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
);
}
center
.
w
=
sqrt
(
center
.
w
);
blockBoundingBox
[
index
]
=
blockSize
;
blockCenter
[
index
]
=
0.5
f
*
(
maxPos
+
minPos
)
;
blockCenter
[
index
]
=
center
;
sortedBlocks
[
index
]
=
make_real2
(
blockSize
.
x
+
blockSize
.
y
+
blockSize
.
z
,
index
);
index
+=
blockDim
.
x
*
gridDim
.
x
;
base
=
index
*
TILE_SIZE
;
...
...
@@ -61,9 +72,60 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co
if
(
rebuild
)
{
rebuildNeighborList
[
0
]
=
1
;
interactionCount
[
0
]
=
0
;
interactionCount
[
1
]
=
0
;
}
}
__device__
int
saveSinglePairs
(
int
x
,
int
*
atoms
,
int
*
flags
,
int
length
,
unsigned
int
maxSinglePairs
,
unsigned
int
*
singlePairCount
,
int2
*
singlePairs
,
int
*
sumBuffer
,
volatile
int
&
pairStartIndex
)
{
// Record interactions that should be computed as single pairs rather than in blocks.
const
int
indexInWarp
=
threadIdx
.
x
%
32
;
int
sum
=
0
;
for
(
int
i
=
indexInWarp
;
i
<
length
;
i
+=
32
)
{
int
count
=
__popc
(
flags
[
i
]);
sum
+=
(
count
<=
MAX_BITS_FOR_PAIRS
?
count
:
0
);
}
sumBuffer
[
indexInWarp
]
=
sum
;
for
(
int
step
=
1
;
step
<
32
;
step
*=
2
)
{
int
add
=
(
indexInWarp
>=
step
?
sumBuffer
[
indexInWarp
-
step
]
:
0
);
sumBuffer
[
indexInWarp
]
+=
add
;
}
int
pairsToStore
=
sumBuffer
[
31
];
if
(
indexInWarp
==
0
)
pairStartIndex
=
atomicAdd
(
singlePairCount
,
pairsToStore
);
int
pairIndex
=
pairStartIndex
+
(
indexInWarp
>
0
?
sumBuffer
[
indexInWarp
-
1
]
:
0
);
for
(
int
i
=
indexInWarp
;
i
<
length
;
i
+=
32
)
{
int
count
=
__popc
(
flags
[
i
]);
if
(
count
<=
MAX_BITS_FOR_PAIRS
&&
pairIndex
+
count
<
maxSinglePairs
)
{
int
f
=
flags
[
i
];
while
(
f
!=
0
)
{
singlePairs
[
pairIndex
]
=
make_int2
(
atoms
[
i
],
x
*
TILE_SIZE
+
__ffs
(
f
)
-
1
);
f
&=
f
-
1
;
pairIndex
++
;
}
}
}
// Compact the remaining interactions.
const
int
warpMask
=
(
1
<<
indexInWarp
)
-
1
;
int
numCompacted
=
0
;
for
(
int
start
=
0
;
start
<
length
;
start
+=
32
)
{
int
i
=
start
+
indexInWarp
;
int
atom
=
atoms
[
i
];
int
flag
=
flags
[
i
];
bool
include
=
(
i
<
length
&&
__popc
(
flags
[
i
])
>
MAX_BITS_FOR_PAIRS
);
int
includeFlags
=
__ballot
(
include
);
if
(
include
)
{
int
index
=
numCompacted
+
__popc
(
includeFlags
&
warpMask
);
atoms
[
index
]
=
atom
;
flags
[
index
]
=
flag
;
}
numCompacted
+=
__popc
(
includeFlags
);
}
return
numCompacted
;
}
/**
* Compare the bounding boxes for each pair of atom blocks (comprised of 32 atoms each), forming a tile. If the two
* atom blocks are sufficiently far apart, mark them as non-interacting. There are two stages in the algorithm.
...
...
@@ -114,8 +176,9 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co
*
*/
extern
"C"
__global__
void
findBlocksWithInteractions
(
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
unsigned
int
*
__restrict__
interactionCount
,
int
*
__restrict__
interactingTiles
,
unsigned
int
*
__restrict__
interactingAtoms
,
const
real4
*
__restrict__
posq
,
unsigned
int
maxTiles
,
unsigned
int
startBlockIndex
,
unsigned
int
numBlocks
,
real2
*
__restrict__
sortedBlocks
,
const
real4
*
__restrict__
sortedBlockCenter
,
unsigned
int
*
__restrict__
interactionCount
,
int
*
__restrict__
interactingTiles
,
unsigned
int
*
__restrict__
interactingAtoms
,
int2
*
__restrict__
singlePairs
,
const
real4
*
__restrict__
posq
,
unsigned
int
maxTiles
,
unsigned
int
maxSinglePairs
,
unsigned
int
startBlockIndex
,
unsigned
int
numBlocks
,
real2
*
__restrict__
sortedBlocks
,
const
real4
*
__restrict__
sortedBlockCenter
,
const
real4
*
__restrict__
sortedBlockBoundingBox
,
const
unsigned
int
*
__restrict__
exclusionIndices
,
const
unsigned
int
*
__restrict__
exclusionRowIndices
,
real4
*
__restrict__
oldPositions
,
const
int
*
__restrict__
rebuildNeighborList
)
{
...
...
@@ -128,12 +191,17 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
const
int
warpIndex
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
32
;
const
int
warpMask
=
(
1
<<
indexInWarp
)
-
1
;
__shared__
int
workgroupBuffer
[
BUFFER_SIZE
*
(
GROUP_SIZE
/
32
)];
__shared__
int
workgroupFlagsBuffer
[
BUFFER_SIZE
*
(
GROUP_SIZE
/
32
)];
__shared__
int
warpExclusions
[
MAX_EXCLUSIONS
*
(
GROUP_SIZE
/
32
)];
__shared__
real3
posBuffer
[
GROUP_SIZE
];
__shared__
volatile
int
workgroupTileIndex
[
GROUP_SIZE
/
32
];
__shared__
int
sumBuffer
[
GROUP_SIZE
];
__shared__
int
worksgroupPairStartIndex
[
GROUP_SIZE
/
32
];
int
*
buffer
=
workgroupBuffer
+
BUFFER_SIZE
*
(
warpStart
/
32
);
int
*
flagsBuffer
=
workgroupFlagsBuffer
+
BUFFER_SIZE
*
(
warpStart
/
32
);
int
*
exclusionsForX
=
warpExclusions
+
MAX_EXCLUSIONS
*
(
warpStart
/
32
);
volatile
int
&
tileStartIndex
=
workgroupTileIndex
[
warpStart
/
32
];
volatile
int
&
pairStartIndex
=
worksgroupPairStartIndex
[
warpStart
/
32
];
// Loop over blocks.
...
...
@@ -176,16 +244,24 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
int
block2
=
block2Base
+
indexInWarp
;
bool
includeBlock2
=
(
block2
<
NUM_BLOCKS
);
if
(
includeBlock2
)
{
real4
blockCenterY
=
(
block2
<
NUM_BLOCKS
?
sortedBlockCenter
[
block2
]
:
make_real4
(
0
))
;
real4
blockSizeY
=
(
block2
<
NUM_BLOCKS
?
sortedBlockBoundingBox
[
block2
]
:
make_real4
(
0
))
;
real4
blockCenterY
=
sortedBlockCenter
[
block2
];
real4
blockSizeY
=
sortedBlockBoundingBox
[
block2
];
real4
blockDelta
=
blockCenterX
-
blockCenterY
;
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
blockDelta
)
#endif
includeBlock2
&=
(
blockDelta
.
x
*
blockDelta
.
x
+
blockDelta
.
y
*
blockDelta
.
y
+
blockDelta
.
z
*
blockDelta
.
z
<
(
PADDED_CUTOFF
+
blockCenterX
.
w
+
blockCenterY
.
w
)
*
(
PADDED_CUTOFF
+
blockCenterX
.
w
+
blockCenterY
.
w
));
blockDelta
.
x
=
max
(
0.0
f
,
fabs
(
blockDelta
.
x
)
-
blockSizeX
.
x
-
blockSizeY
.
x
);
blockDelta
.
y
=
max
(
0.0
f
,
fabs
(
blockDelta
.
y
)
-
blockSizeX
.
y
-
blockSizeY
.
y
);
blockDelta
.
z
=
max
(
0.0
f
,
fabs
(
blockDelta
.
z
)
-
blockSizeX
.
z
-
blockSizeY
.
z
);
includeBlock2
&=
(
blockDelta
.
x
*
blockDelta
.
x
+
blockDelta
.
y
*
blockDelta
.
y
+
blockDelta
.
z
*
blockDelta
.
z
<
PADDED_CUTOFF_SQUARED
);
#ifdef TRICLINIC
// The calculation to find the nearest periodic copy is only guaranteed to work if the nearest copy is less than half a box width away.
// If there's any possibility we might have missed it, do a detailed check.
if
(
periodicBoxSize
.
z
/
2
-
blockSizeX
.
z
-
blockSizeY
.
z
<
PADDED_CUTOFF
||
periodicBoxSize
.
y
/
2
-
blockSizeX
.
y
-
blockSizeY
.
y
<
PADDED_CUTOFF
)
includeBlock2
=
true
;
#endif
if
(
includeBlock2
)
{
unsigned
short
y
=
(
unsigned
short
)
sortedBlocks
[
block2
].
y
;
for
(
int
k
=
0
;
k
<
numExclusions
;
k
++
)
...
...
@@ -203,29 +279,36 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
// Check each atom in block Y for interactions.
int
start
=
y
*
TILE_SIZE
;
int
atom2
=
start
+
indexInWarp
;
int
atom2
=
y
*
TILE_SIZE
+
indexInWarp
;
real3
pos2
=
trimTo3
(
posq
[
atom2
]);
#ifdef USE_PERIODIC
if
(
singlePeriodicCopy
)
{
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
pos2
,
blockCenterX
)
}
#endif
bool
interacts
=
false
;
if
(
atom2
<
NUM_ATOMS
)
{
real4
blockCenterY
=
sortedBlockCenter
[
block2Base
+
i
];
real3
atomDelta
=
posBuffer
[
warpStart
+
indexInWarp
]
-
trimTo3
(
blockCenterY
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
atomDelta
)
#endif
int
atomFlags
=
ballot
(
atomDelta
.
x
*
atomDelta
.
x
+
atomDelta
.
y
*
atomDelta
.
y
+
atomDelta
.
z
*
atomDelta
.
z
<
(
PADDED_CUTOFF
+
blockCenterY
.
w
)
*
(
PADDED_CUTOFF
+
blockCenterY
.
w
));
int
interacts
=
0
;
if
(
atom2
<
NUM_ATOMS
&&
atomFlags
!=
0
)
{
int
first
=
__ffs
(
atomFlags
)
-
1
;
int
last
=
32
-
__clz
(
atomFlags
);
#ifdef USE_PERIODIC
if
(
!
singlePeriodicCopy
)
{
for
(
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
for
(
int
j
=
first
;
j
<
last
;
j
++
)
{
real3
delta
=
pos2
-
posBuffer
[
warpStart
+
j
];
APPLY_PERIODIC_TO_DELTA
(
delta
)
interacts
|=
(
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
<
PADDED_CUTOFF_SQUARED
);
interacts
|=
(
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
<
PADDED_CUTOFF_SQUARED
?
1
<<
j
:
0
);
}
}
else
{
#endif
for
(
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
for
(
int
j
=
first
;
j
<
last
;
j
++
)
{
real3
delta
=
pos2
-
posBuffer
[
warpStart
+
j
];
interacts
|=
(
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
<
PADDED_CUTOFF_SQUARED
);
interacts
|=
(
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
<
PADDED_CUTOFF_SQUARED
?
1
<<
j
:
0
);
}
#ifdef USE_PERIODIC
}
...
...
@@ -235,34 +318,46 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
// Add any interacting atoms to the buffer.
int
includeAtomFlags
=
__ballot
(
interacts
);
if
(
interacts
)
buffer
[
neighborsInBuffer
+
__popc
(
includeAtomFlags
&
warpMask
)]
=
atom2
;
if
(
interacts
)
{
int
index
=
neighborsInBuffer
+
__popc
(
includeAtomFlags
&
warpMask
);
buffer
[
index
]
=
atom2
;
flagsBuffer
[
index
]
=
interacts
;
}
neighborsInBuffer
+=
__popc
(
includeAtomFlags
);
if
(
neighborsInBuffer
>
BUFFER_SIZE
-
TILE_SIZE
)
{
// Store the new tiles to memory.
#if MAX_BITS_FOR_PAIRS > 0
neighborsInBuffer
=
saveSinglePairs
(
x
,
buffer
,
flagsBuffer
,
neighborsInBuffer
,
maxSinglePairs
,
&
interactionCount
[
1
],
singlePairs
,
sumBuffer
+
warpStart
,
pairStartIndex
);
#endif
int
tilesToStore
=
neighborsInBuffer
/
TILE_SIZE
;
if
(
indexInWarp
==
0
)
tileStartIndex
=
atomicAdd
(
interactionCount
,
tilesToStore
);
int
newTileStartIndex
=
tileStartIndex
;
if
(
newTileStartIndex
+
tilesToStore
<=
maxTiles
)
{
if
(
indexInWarp
<
tilesToStore
)
interactingTiles
[
newTileStartIndex
+
indexInWarp
]
=
x
;
for
(
int
j
=
0
;
j
<
tilesToStore
;
j
++
)
interactingAtoms
[(
newTileStartIndex
+
j
)
*
TILE_SIZE
+
indexInWarp
]
=
buffer
[
indexInWarp
+
j
*
TILE_SIZE
];
if
(
tilesToStore
>
0
)
{
if
(
indexInWarp
==
0
)
tileStartIndex
=
atomicAdd
(
&
interactionCount
[
0
],
tilesToStore
);
int
newTileStartIndex
=
tileStartIndex
;
if
(
newTileStartIndex
+
tilesToStore
<=
maxTiles
)
{
if
(
indexInWarp
<
tilesToStore
)
interactingTiles
[
newTileStartIndex
+
indexInWarp
]
=
x
;
for
(
int
j
=
0
;
j
<
tilesToStore
;
j
++
)
interactingAtoms
[(
newTileStartIndex
+
j
)
*
TILE_SIZE
+
indexInWarp
]
=
buffer
[
indexInWarp
+
j
*
TILE_SIZE
];
}
buffer
[
indexInWarp
]
=
buffer
[
indexInWarp
+
TILE_SIZE
*
tilesToStore
];
neighborsInBuffer
-=
TILE_SIZE
*
tilesToStore
;
}
buffer
[
indexInWarp
]
=
buffer
[
indexInWarp
+
TILE_SIZE
*
tilesToStore
];
neighborsInBuffer
-=
TILE_SIZE
*
tilesToStore
;
}
}
}
// If we have a partially filled buffer, store it to memory.
#if MAX_BITS_FOR_PAIRS > 0
if
(
neighborsInBuffer
>
32
)
neighborsInBuffer
=
saveSinglePairs
(
x
,
buffer
,
flagsBuffer
,
neighborsInBuffer
,
maxSinglePairs
,
&
interactionCount
[
1
],
singlePairs
,
sumBuffer
+
warpStart
,
pairStartIndex
);
#endif
if
(
neighborsInBuffer
>
0
)
{
int
tilesToStore
=
(
neighborsInBuffer
+
TILE_SIZE
-
1
)
/
TILE_SIZE
;
if
(
indexInWarp
==
0
)
tileStartIndex
=
atomicAdd
(
interactionCount
,
tilesToStore
);
tileStartIndex
=
atomicAdd
(
&
interactionCount
[
0
]
,
tilesToStore
);
int
newTileStartIndex
=
tileStartIndex
;
if
(
newTileStartIndex
+
tilesToStore
<=
maxTiles
)
{
if
(
indexInWarp
<
tilesToStore
)
...
...
@@ -277,4 +372,4 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
for
(
int
i
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
oldPositions
[
i
]
=
posq
[
i
];
}
\ No newline at end of file
}
platforms/cuda/src/kernels/gbsaObc1.cu
View file @
3b6925ae
...
...
@@ -264,7 +264,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
real4
posq1
=
posq
[
atom1
];
float2
params1
=
global_params
[
atom1
];
#ifdef USE_CUTOFF
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
]
:
y
*
TILE_SIZE
+
tgx
)
;
unsigned
int
j
=
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
];
#else
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
...
...
@@ -400,7 +400,7 @@ typedef struct {
*/
extern
"C"
__global__
void
computeGBSAForce1
(
unsigned
long
long
*
__restrict__
forceBuffers
,
unsigned
long
long
*
__restrict__
global_bornForce
,
mixed
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
global_bornRadii
,
mixed
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
global_bornRadii
,
bool
needEnergy
,
#ifdef USE_CUTOFF
const
int
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
...
...
@@ -465,7 +465,8 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
if
(
atom1
!=
y
*
TILE_SIZE
+
j
)
tempEnergy
-=
scaledChargeProduct
/
CUTOFF
;
#endif
energy
+=
0.5
f
*
tempEnergy
;
if
(
needEnergy
)
energy
+=
0.5
f
*
tempEnergy
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
...
...
@@ -519,7 +520,8 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
#ifdef USE_CUTOFF
tempEnergy
-=
scaledChargeProduct
/
CUTOFF
;
#endif
energy
+=
tempEnergy
;
if
(
needEnergy
)
energy
+=
tempEnergy
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
...
...
@@ -617,7 +619,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
real4
posq1
=
posq
[
atom1
];
real
bornRadius1
=
global_bornRadii
[
atom1
];
#ifdef USE_CUTOFF
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
]
:
y
*
TILE_SIZE
+
tgx
)
;
unsigned
int
j
=
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
];
#else
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
...
...
@@ -667,7 +669,8 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
#ifdef USE_CUTOFF
tempEnergy
-=
scaledChargeProduct
/
CUTOFF
;
#endif
energy
+=
tempEnergy
;
if
(
needEnergy
)
energy
+=
tempEnergy
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
...
...
@@ -716,7 +719,8 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
#ifdef USE_CUTOFF
tempEnergy
-=
scaledChargeProduct
/
CUTOFF
;
#endif
energy
+=
tempEnergy
;
if
(
needEnergy
)
energy
+=
tempEnergy
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
...
...
platforms/cuda/src/kernels/langevin.cu
View file @
3b6925ae
...
...
@@ -78,7 +78,7 @@ extern "C" __global__ void integrateLangevinPart2(int numAtoms, real4* __restric
* Select the step size to use for the next step.
*/
extern
"C"
__global__
void
selectLangevinStepSize
(
int
numAtoms
,
int
paddedNumAtoms
,
mixed
maxStepSize
,
mixed
errorTol
,
mixed
tau
,
mixed
kT
,
mixed2
*
__restrict__
dt
,
extern
"C"
__global__
void
selectLangevinStepSize
(
int
numAtoms
,
int
paddedNumAtoms
,
mixed
maxStepSize
,
mixed
errorTol
,
mixed
friction
,
mixed
kT
,
mixed2
*
__restrict__
dt
,
const
mixed4
*
__restrict__
velm
,
const
long
long
*
__restrict__
force
,
mixed
*
__restrict__
paramBuffer
)
{
// Calculate the error.
...
...
@@ -119,9 +119,9 @@ extern "C" __global__ void selectLangevinStepSize(int numAtoms, int paddedNumAto
// Recalculate the integration parameters.
mixed
vscale
=
EXP
(
-
newStepSize
/
tau
);
mixed
fscale
=
(
1
-
vscale
)
*
tau
;
mixed
noisescale
=
SQRT
(
2
*
kT
/
tau
)
*
SQRT
(
0.5
f
*
(
1
-
vscale
*
vscale
)
*
tau
);
mixed
vscale
=
exp
(
-
newStepSize
*
friction
);
mixed
fscale
=
(
friction
==
0
?
newStepSize
:
(
1
-
vscale
)
/
friction
)
;
mixed
noisescale
=
sqrt
(
kT
*
(
1
-
vscale
*
vscale
));
params
[
VelScale
]
=
vscale
;
params
[
ForceScale
]
=
fscale
;
params
[
NoiseScale
]
=
noisescale
;
...
...
platforms/cuda/src/kernels/nonbonded.cu
View file @
3b6925ae
...
...
@@ -103,9 +103,10 @@ extern "C" __global__ void computeNonbonded(
unsigned
long
long
*
__restrict__
forceBuffers
,
mixed
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
tileflags
*
__restrict__
exclusions
,
const
ushort2
*
__restrict__
exclusionTiles
,
unsigned
int
startTileIndex
,
unsigned
int
numTileIndices
#ifdef USE_CUTOFF
,
const
int
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
,
const
int
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
const
real4
*
__restrict__
blockSize
,
const
unsigned
int
*
__restrict__
interactingAtoms
const
real4
*
__restrict__
blockSize
,
const
unsigned
int
*
__restrict__
interactingAtoms
,
unsigned
int
maxSinglePairs
,
const
int2
*
__restrict__
singlePairs
#endif
PARAMETER_ARGUMENTS
)
{
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
...
...
@@ -278,10 +279,10 @@ extern "C" __global__ void computeNonbonded(
localData
[
tbx
+
tj
].
fz
+=
dEdR2
.
z
;
#endif
#endif // end USE_SYMMETRIC
#endif
#ifdef ENABLE_SHUFFLE
SHUFFLE_WARP_DATA
#endif
#endif
#ifdef USE_EXCLUSIONS
excl
>>=
1
;
#endif
...
...
@@ -379,7 +380,7 @@ extern "C" __global__ void computeNonbonded(
LOAD_ATOM1_PARAMETERS
//const unsigned int localAtomIndex = threadIdx.x;
#ifdef USE_CUTOFF
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
]
:
y
*
TILE_SIZE
+
tgx
)
;
unsigned
int
j
=
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
];
#else
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
...
...
@@ -484,9 +485,9 @@ extern "C" __global__ void computeNonbonded(
localData
[
tbx
+
tj
].
fz
+=
dEdR2
.
z
;
#endif
#endif // end USE_SYMMETRIC
#endif
#ifdef ENABLE_SHUFFLE
SHUFFLE_WARP_DATA
#endif
#endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
...
...
@@ -555,9 +556,9 @@ extern "C" __global__ void computeNonbonded(
localData
[
tbx
+
tj
].
fz
+=
dEdR2
.
z
;
#endif
#endif // end USE_SYMMETRIC
#endif
#ifdef ENABLE_SHUFFLE
SHUFFLE_WARP_DATA
#endif
#endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
...
...
@@ -588,6 +589,59 @@ extern "C" __global__ void computeNonbonded(
}
pos
++
;
}
// Third loop: single pairs that aren't part of a tile.
#if USE_CUTOFF
const
unsigned
int
numPairs
=
interactionCount
[
1
];
if
(
numPairs
>
maxSinglePairs
)
return
;
// There wasn't enough memory for the neighbor list.
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
numPairs
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
int2
pair
=
singlePairs
[
i
];
int
atom1
=
pair
.
x
;
int
atom2
=
pair
.
y
;
real4
posq1
=
posq
[
atom1
];
real4
posq2
=
posq
[
atom2
];
LOAD_ATOM1_PARAMETERS
int
j
=
atom2
;
atom2
=
threadIdx
.
x
;
DECLARE_LOCAL_PARAMETERS
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
LOAD_ATOM2_PARAMETERS
atom2
=
pair
.
y
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
real
invR
=
RSQRT
(
r2
);
real
r
=
r2
*
invR
;
#ifdef USE_SYMMETRIC
real
dEdR
=
0.0
f
;
#else
real3
dEdR1
=
make_real3
(
0
);
real3
dEdR2
=
make_real3
(
0
);
#endif
bool
hasExclusions
=
false
;
bool
isExcluded
=
false
;
real
tempEnergy
=
0.0
f
;
const
real
interactionScale
=
1.0
f
;
COMPUTE_INTERACTION
energy
+=
tempEnergy
;
#ifdef INCLUDE_FORCES
#ifdef USE_SYMMETRIC
real3
dEdR1
=
delta
*
dEdR
;
real3
dEdR2
=
-
dEdR1
;
#endif
atomicAdd
(
&
forceBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
-
dEdR1
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
-
dEdR1
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
-
dEdR1
.
z
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
-
dEdR2
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
-
dEdR2
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
-
dEdR2
.
z
*
0x100000000
)));
#endif
}
#endif
#ifdef INCLUDE_ENERGY
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
#endif
...
...
platforms/cuda/src/kernels/pme.cu
View file @
3b6925ae
...
...
@@ -21,7 +21,11 @@ extern "C" __global__ void findAtomGridIndex(const real4* __restrict__ posq, int
extern
"C"
__global__
void
gridSpreadCharge
(
const
real4
*
__restrict__
posq
,
real
*
__restrict__
originalPmeGrid
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
,
const
int2
*
__restrict__
pmeAtomGridIndex
)
{
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
,
const
int2
*
__restrict__
pmeAtomGridIndex
#ifdef USE_LJPME
,
const
real2
*
__restrict__
sigmaEpsilon
#endif
)
{
real3
data
[
PME_ORDER
];
const
real
scale
=
RECIP
(
PME_ORDER
-
1
);
...
...
@@ -62,7 +66,13 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
data
[
0
]
=
scale
*
(
make_real3
(
1
)
-
dr
)
*
data
[
0
];
// Spread the charge from this atom onto each grid point.
#ifdef USE_LJPME
const
real2
sigEps
=
sigmaEpsilon
[
atom
];
const
real
charge
=
8
*
sigEps
.
x
*
sigEps
.
x
*
sigEps
.
x
*
sigEps
.
y
;
#else
const
real
charge
=
pos
.
w
;
#endif
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
int
xbase
=
gridIndex
.
x
+
ix
;
xbase
-=
(
xbase
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
);
...
...
@@ -80,7 +90,7 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
zindex
-=
(
zindex
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
);
int
index
=
ybase
+
zindex
;
real
add
=
pos
.
w
*
dx
*
dy
*
data
[
iz
].
z
;
real
add
=
charge
*
dx
*
dy
*
data
[
iz
].
z
;
#ifdef USE_DOUBLE_PRECISION
unsigned
long
long
*
ulonglong_p
=
(
unsigned
long
long
*
)
originalPmeGrid
;
atomicAdd
(
&
ulonglong_p
[
index
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
add
*
0x100000000
)));
...
...
@@ -121,7 +131,15 @@ reciprocalConvolution(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict
real4
periodicBoxSize
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
// R2C stores into a half complex matrix where the last dimension is cut by half
const
unsigned
int
gridSize
=
GRID_SIZE_X
*
GRID_SIZE_Y
*
(
GRID_SIZE_Z
/
2
+
1
);
#ifdef USE_LJPME
const
real
recipScaleFactor
=
-
2
*
M_PI
*
SQRT
(
M_PI
)
*
RECIP
(
6
*
periodicBoxSize
.
x
*
periodicBoxSize
.
y
*
periodicBoxSize
.
z
);
real
bfac
=
M_PI
/
EWALD_ALPHA
;
real
fac1
=
2
*
M_PI
*
M_PI
*
M_PI
*
SQRT
(
M_PI
);
real
fac2
=
EWALD_ALPHA
*
EWALD_ALPHA
*
EWALD_ALPHA
;
real
fac3
=
-
2
*
EWALD_ALPHA
*
M_PI
*
M_PI
;
#else
const
real
recipScaleFactor
=
RECIP
(
M_PI
*
periodicBoxSize
.
x
*
periodicBoxSize
.
y
*
periodicBoxSize
.
z
);
#endif
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
gridSize
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
// real indices
...
...
@@ -140,12 +158,23 @@ reciprocalConvolution(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict
real
bz
=
pmeBsplineModuliZ
[
kz
];
real2
grid
=
halfcomplex_pmeGrid
[
index
];
real
m2
=
mhx
*
mhx
+
mhy
*
mhy
+
mhz
*
mhz
;
#ifdef USE_LJPME
real
denom
=
recipScaleFactor
/
(
bx
*
by
*
bz
);
real
m
=
SQRT
(
m2
);
real
m3
=
m
*
m2
;
real
b
=
bfac
*
m
;
real
expfac
=
-
b
*
b
;
real
expterm
=
EXP
(
expfac
);
real
erfcterm
=
ERFC
(
b
);
real
eterm
=
(
fac1
*
erfcterm
*
m3
+
expterm
*
(
fac2
+
fac3
*
m2
))
*
denom
;
halfcomplex_pmeGrid
[
index
]
=
make_real2
(
grid
.
x
*
eterm
,
grid
.
y
*
eterm
);
#else
real
denom
=
m2
*
bx
*
by
*
bz
;
real
eterm
=
recipScaleFactor
*
EXP
(
-
RECIP_EXP_FACTOR
*
m2
)
/
denom
;
if
(
kx
!=
0
||
ky
!=
0
||
kz
!=
0
)
{
halfcomplex_pmeGrid
[
index
]
=
make_real2
(
grid
.
x
*
eterm
,
grid
.
y
*
eterm
);
}
#endif
}
}
...
...
@@ -156,8 +185,16 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__
real4
periodicBoxSize
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
// R2C stores into a half complex matrix where the last dimension is cut by half
const
unsigned
int
gridSize
=
GRID_SIZE_X
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
#ifdef USE_LJPME
const
real
recipScaleFactor
=
-
2
*
M_PI
*
SQRT
(
M_PI
)
*
RECIP
(
6
*
periodicBoxSize
.
x
*
periodicBoxSize
.
y
*
periodicBoxSize
.
z
);
real
bfac
=
M_PI
/
EWALD_ALPHA
;
real
fac1
=
2
*
M_PI
*
M_PI
*
M_PI
*
SQRT
(
M_PI
);
real
fac2
=
EWALD_ALPHA
*
EWALD_ALPHA
*
EWALD_ALPHA
;
real
fac3
=
-
2
*
EWALD_ALPHA
*
M_PI
*
M_PI
;
#else
const
real
recipScaleFactor
=
RECIP
(
M_PI
*
periodicBoxSize
.
x
*
periodicBoxSize
.
y
*
periodicBoxSize
.
z
);
#endif
mixed
energy
=
0
;
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
gridSize
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
// real indices
...
...
@@ -175,8 +212,19 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__
real
bx
=
pmeBsplineModuliX
[
kx
];
real
by
=
pmeBsplineModuliY
[
ky
];
real
bz
=
pmeBsplineModuliZ
[
kz
];
#ifdef USE_LJPME
real
denom
=
recipScaleFactor
/
(
bx
*
by
*
bz
);
real
m
=
SQRT
(
m2
);
real
m3
=
m
*
m2
;
real
b
=
bfac
*
m
;
real
expfac
=
-
b
*
b
;
real
expterm
=
EXP
(
expfac
);
real
erfcterm
=
ERFC
(
b
);
real
eterm
=
(
fac1
*
erfcterm
*
m3
+
expterm
*
(
fac2
+
fac3
*
m2
))
*
denom
;
#else
real
denom
=
m2
*
bx
*
by
*
bz
;
real
eterm
=
recipScaleFactor
*
EXP
(
-
RECIP_EXP_FACTOR
*
m2
)
/
denom
;
#endif
if
(
kz
>=
(
GRID_SIZE_Z
/
2
+
1
))
{
kx
=
((
kx
==
0
)
?
kx
:
GRID_SIZE_X
-
kx
);
...
...
@@ -185,9 +233,10 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__
}
int
indexInHalfComplexGrid
=
kz
+
ky
*
(
GRID_SIZE_Z
/
2
+
1
)
+
kx
*
(
GRID_SIZE_Y
*
(
GRID_SIZE_Z
/
2
+
1
));
real2
grid
=
halfcomplex_pmeGrid
[
indexInHalfComplexGrid
];
if
(
kx
!=
0
||
ky
!=
0
||
kz
!=
0
)
{
#ifndef USE_LJPME
if
(
kx
!=
0
||
ky
!=
0
||
kz
!=
0
)
#endif
energy
+=
eterm
*
(
grid
.
x
*
grid
.
x
+
grid
.
y
*
grid
.
y
);
}
}
#ifdef USE_PME_STREAM
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
=
0.5
f
*
energy
;
...
...
@@ -199,7 +248,11 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__
extern
"C"
__global__
void
gridInterpolateForce
(
const
real4
*
__restrict__
posq
,
unsigned
long
long
*
__restrict__
forceBuffers
,
const
real
*
__restrict__
originalPmeGrid
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
,
const
int2
*
__restrict__
pmeAtomGridIndex
)
{
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
,
const
int2
*
__restrict__
pmeAtomGridIndex
#ifdef USE_LJPME
,
const
real2
*
__restrict__
sigmaEpsilon
#endif
)
{
real3
data
[
PME_ORDER
];
real3
ddata
[
PME_ORDER
];
const
real
scale
=
RECIP
(
PME_ORDER
-
1
);
...
...
@@ -271,7 +324,12 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __
}
}
}
#ifdef USE_LJPME
const
real2
sigEps
=
sigmaEpsilon
[
atom
];
real
q
=
8
*
sigEps
.
x
*
sigEps
.
x
*
sigEps
.
x
*
sigEps
.
y
;
#else
real
q
=
pos
.
w
*
EPSILON_FACTOR
;
#endif
real
forceX
=
-
q
*
(
force
.
x
*
GRID_SIZE_X
*
recipBoxVecX
.
x
);
real
forceY
=
-
q
*
(
force
.
x
*
GRID_SIZE_X
*
recipBoxVecY
.
x
+
force
.
y
*
GRID_SIZE_Y
*
recipBoxVecY
.
y
);
real
forceZ
=
-
q
*
(
force
.
x
*
GRID_SIZE_X
*
recipBoxVecZ
.
x
+
force
.
y
*
GRID_SIZE_Y
*
recipBoxVecZ
.
y
+
force
.
z
*
GRID_SIZE_Z
*
recipBoxVecZ
.
z
);
...
...
platforms/cuda/src/kernels/sort.cu
View file @
3b6925ae
...
...
@@ -52,7 +52,8 @@ __global__ void sortShortList(DATA_TYPE* __restrict__ data, unsigned int length)
*/
__global__
void
computeRange
(
const
DATA_TYPE
*
__restrict__
data
,
unsigned
int
length
,
KEY_TYPE
*
__restrict__
range
,
unsigned
int
numBuckets
,
unsigned
int
*
__restrict__
bucketOffset
)
{
extern
__shared__
KEY_TYPE
rangeBuffer
[];
extern
__shared__
KEY_TYPE
minBuffer
[];
KEY_TYPE
*
maxBuffer
=
minBuffer
+
blockDim
.
x
;
KEY_TYPE
minimum
=
MAX_KEY
;
KEY_TYPE
maximum
=
MIN_KEY
;
...
...
@@ -66,23 +67,18 @@ __global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int le
// Now reduce them.
rangeBuffer
[
threadIdx
.
x
]
=
minimum
;
minBuffer
[
threadIdx
.
x
]
=
minimum
;
maxBuffer
[
threadIdx
.
x
]
=
maximum
;
__syncthreads
();
for
(
unsigned
int
step
=
1
;
step
<
blockDim
.
x
;
step
*=
2
)
{
if
(
threadIdx
.
x
+
step
<
blockDim
.
x
&&
threadIdx
.
x
%
(
2
*
step
)
==
0
)
rangeBuffer
[
threadIdx
.
x
]
=
min
(
rangeBuffer
[
threadIdx
.
x
],
rangeBuffer
[
threadIdx
.
x
+
step
]);
__syncthreads
();
}
minimum
=
rangeBuffer
[
0
];
__syncthreads
();
rangeBuffer
[
threadIdx
.
x
]
=
maximum
;
__syncthreads
();
for
(
unsigned
int
step
=
1
;
step
<
blockDim
.
x
;
step
*=
2
)
{
if
(
threadIdx
.
x
+
step
<
blockDim
.
x
&&
threadIdx
.
x
%
(
2
*
step
)
==
0
)
rangeBuffer
[
threadIdx
.
x
]
=
max
(
rangeBuffer
[
threadIdx
.
x
],
rangeBuffer
[
threadIdx
.
x
+
step
]);
if
(
threadIdx
.
x
+
step
<
blockDim
.
x
&&
threadIdx
.
x
%
(
2
*
step
)
==
0
)
{
minBuffer
[
threadIdx
.
x
]
=
min
(
minBuffer
[
threadIdx
.
x
],
minBuffer
[
threadIdx
.
x
+
step
]);
maxBuffer
[
threadIdx
.
x
]
=
max
(
maxBuffer
[
threadIdx
.
x
],
maxBuffer
[
threadIdx
.
x
+
step
]);
}
__syncthreads
();
}
maximum
=
rangeBuffer
[
0
];
minimum
=
minBuffer
[
0
];
maximum
=
maxBuffer
[
0
];
if
(
threadIdx
.
x
==
0
)
{
range
[
0
]
=
minimum
;
range
[
1
]
=
maximum
;
...
...
@@ -98,7 +94,7 @@ __global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int le
* Assign elements to buckets.
*/
__global__
void
assignElementsToBuckets
(
const
DATA_TYPE
*
__restrict__
data
,
unsigned
int
length
,
unsigned
int
numBuckets
,
const
KEY_TYPE
*
__restrict__
range
,
unsigned
int
*
bucketOffset
,
unsigned
int
*
__restrict__
bucketOfElement
,
unsigned
int
*
__restrict__
offsetInBucket
)
{
unsigned
int
*
__restrict__
bucketOffset
,
unsigned
int
*
__restrict__
bucketOfElement
,
unsigned
int
*
__restrict__
offsetInBucket
)
{
float
minValue
=
(
float
)
(
range
[
0
]);
float
maxValue
=
(
float
)
(
range
[
1
]);
float
bucketWidth
=
(
maxValue
-
minValue
)
/
numBuckets
;
...
...
platforms/opencl/include/OpenCLKernels.h
View file @
3b6925ae
...
...
@@ -607,13 +607,22 @@ public:
void
copyParametersToContext
(
ContextImpl
&
context
,
const
NonbondedForce
&
force
);
/**
* Get the parameters being used for PME.
*
*
* @param alpha the separation parameter
* @param nx the number of grid points along the X axis
* @param ny the number of grid points along the Y axis
* @param nz the number of grid points along the Z axis
*/
void
getPMEParameters
(
double
&
alpha
,
int
&
nx
,
int
&
ny
,
int
&
nz
)
const
;
/**
* Get the parameters being used for the dispersion term in LJPME.
*
* @param alpha the separation parameter
* @param nx the number of grid points along the X axis
* @param ny the number of grid points along the Y axis
* @param nz the number of grid points along the Z axis
*/
void
getLJPMEParameters
(
double
&
alpha
,
int
&
nx
,
int
&
ny
,
int
&
nz
)
const
;
private:
class
SortTrait
:
public
OpenCLSort
::
SortTrait
{
int
getDataSize
()
const
{
return
8
;}
...
...
@@ -664,8 +673,9 @@ private:
cl
::
Kernel
pmeInterpolateForceKernel
;
std
::
map
<
std
::
string
,
std
::
string
>
pmeDefines
;
std
::
vector
<
std
::
pair
<
int
,
int
>
>
exceptionAtoms
;
double
ewaldSelfEnergy
,
dispersionCoefficient
,
alpha
;
double
ewaldSelfEnergy
,
dispersionCoefficient
,
alpha
,
dispersionAlpha
;
int
gridSizeX
,
gridSizeY
,
gridSizeZ
;
int
dispersionGridSizeX
,
dispersionGridSizeY
,
dispersionGridSizeZ
;
bool
hasCoulomb
,
hasLJ
,
usePmeQueue
;
NonbondedMethod
nonbondedMethod
;
static
const
int
PmeOrder
=
5
;
...
...
@@ -1419,7 +1429,7 @@ private:
void
prepareForComputation
(
ContextImpl
&
context
,
CustomIntegrator
&
integrator
,
bool
&
forcesAreValid
);
Lepton
::
ExpressionTreeNode
replaceDerivFunctions
(
const
Lepton
::
ExpressionTreeNode
&
node
,
OpenMM
::
ContextImpl
&
context
);
void
findExpressionsForDerivs
(
const
Lepton
::
ExpressionTreeNode
&
node
,
std
::
vector
<
std
::
pair
<
Lepton
::
ExpressionTreeNode
,
std
::
string
>
>&
variableNodes
);
void
recordGlobalValue
(
double
value
,
GlobalTarget
target
);
void
recordGlobalValue
(
double
value
,
GlobalTarget
target
,
CustomIntegrator
&
integrator
);
void
recordChangedParameters
(
ContextImpl
&
context
);
bool
evaluateCondition
(
int
step
);
OpenCLContext
&
cl
;
...
...
platforms/opencl/include/OpenCLParallelKernels.h
View file @
3b6925ae
...
...
@@ -431,13 +431,22 @@ public:
void
copyParametersToContext
(
ContextImpl
&
context
,
const
NonbondedForce
&
force
);
/**
* Get the parameters being used for PME.
*
*
* @param alpha the separation parameter
* @param nx the number of grid points along the X axis
* @param ny the number of grid points along the Y axis
* @param nz the number of grid points along the Z axis
*/
void
getPMEParameters
(
double
&
alpha
,
int
&
nx
,
int
&
ny
,
int
&
nz
)
const
;
/**
* Get the parameters being used for the dispersion term in LJPME.
*
* @param alpha the separation parameter
* @param nx the number of grid points along the X axis
* @param ny the number of grid points along the Y axis
* @param nz the number of grid points along the Z axis
*/
void
getLJPMEParameters
(
double
&
alpha
,
int
&
nx
,
int
&
ny
,
int
&
nz
)
const
;
private:
class
Task
;
OpenCLPlatform
::
PlatformData
&
data
;
...
...
platforms/opencl/src/OpenCLKernels.cpp
View file @
3b6925ae
...
...
@@ -1733,7 +1733,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
else if (nonbondedMethod == PME) {
// Compute the PME parameters.
NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSizeX, gridSizeY, gridSizeZ);
NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSizeX, gridSizeY, gridSizeZ
, false
);
gridSizeX = OpenCLFFT3D::findLegalDimension(gridSizeX);
gridSizeY = OpenCLFFT3D::findLegalDimension(gridSizeY);
gridSizeZ = OpenCLFFT3D::findLegalDimension(gridSizeZ);
...
...
@@ -2086,8 +2086,8 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeEvalEnergyKernel.setArg<mm_float4>(7, recipBoxVectorsFloat[2]);
}
if (includeEnergy)
cl.executeKernel(pmeEvalEnergyKernel,
cl.getNumAtoms()
);
cl.executeKernel(pmeConvolutionKernel,
cl.getNumAtoms()
);
cl.executeKernel(pmeEvalEnergyKernel,
gridSizeX*gridSizeY*gridSizeZ
);
cl.executeKernel(pmeConvolutionKernel,
gridSizeX*gridSizeY*gridSizeZ
);
fft->execFFT(*pmeGrid2, *pmeGrid, false);
setPeriodicBoxArgs(cl, pmeInterpolateForceKernel, 3);
if (cl.getUseDoublePrecision()) {
...
...
@@ -2205,6 +2205,20 @@ void OpenCLCalcNonbondedForceKernel::getPMEParameters(double& alpha, int& nx, in
}
}
void OpenCLCalcNonbondedForceKernel::getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
if (nonbondedMethod != LJPME)
throw OpenMMException("getPMEParametersInContext: This Context is not using PME");
if (cl.getPlatformData().useCpuPme)
//cpuPme.getAs<CalcPmeReciprocalForceKernel>().getLJPMEParameters(alpha, nx, ny, nz);
throw OpenMMException("getPMEParametersInContext: CPUPME has not been implemented for LJPME yet.");
else {
alpha = this->dispersionAlpha;
nx = dispersionGridSizeX;
ny = dispersionGridSizeY;
nz = dispersionGridSizeZ;
}
}
class OpenCLCustomNonbondedForceInfo : public OpenCLForceInfo {
public:
OpenCLCustomNonbondedForceInfo(int requiredBuffers, const CustomNonbondedForce& force) : OpenCLForceInfo(requiredBuffers), force(force) {
...
...
@@ -2331,7 +2345,7 @@ void OpenCLCalcCustomNonbondedForceKernel::initialize(const System& system, cons
Lepton::ParsedExpression energyExpression = Lepton::Parser::parse(force.getEnergyFunction(), functions).optimize();
Lepton::ParsedExpression forceExpression = energyExpression.differentiate("r").optimize();
map<string, Lepton::ParsedExpression> forceExpressions;
forceExpressions["
temp
Energy
+
= "] = energyExpression;
forceExpressions["
real custom
Energy = "] = energyExpression;
forceExpressions["tempForce -= "] = forceExpression;
// Create the kernels.
...
...
@@ -2676,7 +2690,6 @@ double OpenCLCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool
interactionGroupKernel.setArg<cl::Buffer>(index++, cl.getEnergyBuffer().getDeviceBuffer());
interactionGroupKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
interactionGroupKernel.setArg<cl::Buffer>(index++, interactionGroupData->getDeviceBuffer());
setPeriodicBoxArgs(cl, interactionGroupKernel, index);
index += 5;
for (int i = 0; i < (int) params->getBuffers().size(); i++)
interactionGroupKernel.setArg<cl::Memory>(index++, params->getBuffers()[i].getMemory());
...
...
@@ -2685,6 +2698,7 @@ double OpenCLCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool
if (globals != NULL)
interactionGroupKernel.setArg<cl::Buffer>(index++, globals->getDeviceBuffer());
}
setPeriodicBoxArgs(cl, interactionGroupKernel, 4);
int forceThreadBlockSize = max(32, cl.getNonbondedUtilities().getForceThreadBlockSize());
cl.executeKernel(interactionGroupKernel, numGroupThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
}
...
...
@@ -2872,6 +2886,7 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
force1Kernel.setArg<cl::Buffer>(index++, cl.getEnergyBuffer().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(index++, bornRadii->getDeviceBuffer());
index++; // Whether to include energy.
if (nb.getUseCutoff()) {
force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
...
...
@@ -2907,17 +2922,18 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
reduceBornForceKernel.setArg<cl::Buffer>(index++, bornRadii->getDeviceBuffer());
reduceBornForceKernel.setArg<cl::Buffer>(index++, obcChain->getDeviceBuffer());
}
force1Kernel.setArg<cl_int>(5, includeEnergy);
if (nb.getUseCutoff()) {
setPeriodicBoxArgs(cl, computeBornSumKernel, 5);
setPeriodicBoxArgs(cl, force1Kernel,
7
);
setPeriodicBoxArgs(cl, force1Kernel,
8
);
if (maxTiles < nb.getInteractingTiles().getSize()) {
maxTiles = nb.getInteractingTiles().getSize();
computeBornSumKernel.setArg<cl::Buffer>(3, nb.getInteractingTiles().getDeviceBuffer());
computeBornSumKernel.setArg<cl_uint>(10, maxTiles);
computeBornSumKernel.setArg<cl::Buffer>(13, nb.getInteractingAtoms().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(
5
, nb.getInteractingTiles().getDeviceBuffer());
force1Kernel.setArg<cl_uint>(1
2
, maxTiles);
force1Kernel.setArg<cl::Buffer>(1
5
, nb.getInteractingAtoms().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(
6
, nb.getInteractingTiles().getDeviceBuffer());
force1Kernel.setArg<cl_uint>(1
3
, maxTiles);
force1Kernel.setArg<cl::Buffer>(1
6
, nb.getInteractingAtoms().getDeviceBuffer());
}
}
cl.executeKernel(computeBornSumKernel, nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize());
...
...
@@ -3933,6 +3949,7 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
pairEnergyKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*4*elementSize, NULL);
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusions().getDeviceBuffer());
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionTiles().getDeviceBuffer());
index++; // Whether to include energy.
if (nb.getUseCutoff()) {
pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
...
...
@@ -4029,17 +4046,18 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
if (changed)
globals->upload(globalParamValues);
}
pairEnergyKernel.setArg<cl_int>(7, includeEnergy);
if (nb.getUseCutoff()) {
setPeriodicBoxArgs(cl, pairValueKernel, 8);
setPeriodicBoxArgs(cl, pairEnergyKernel,
9
);
setPeriodicBoxArgs(cl, pairEnergyKernel,
10
);
if (maxTiles < nb.getInteractingTiles().getSize()) {
maxTiles = nb.getInteractingTiles().getSize();
pairValueKernel.setArg<cl::Buffer>(6, nb.getInteractingTiles().getDeviceBuffer());
pairValueKernel.setArg<cl_uint>(13, maxTiles);
pairValueKernel.setArg<cl::Buffer>(16, nb.getInteractingAtoms().getDeviceBuffer());
pairEnergyKernel.setArg<cl::Buffer>(
7
, nb.getInteractingTiles().getDeviceBuffer());
pairEnergyKernel.setArg<cl_uint>(1
4
, maxTiles);
pairEnergyKernel.setArg<cl::Buffer>(1
7
, nb.getInteractingAtoms().getDeviceBuffer());
pairEnergyKernel.setArg<cl::Buffer>(
8
, nb.getInteractingTiles().getDeviceBuffer());
pairEnergyKernel.setArg<cl_uint>(1
5
, maxTiles);
pairEnergyKernel.setArg<cl::Buffer>(1
8
, nb.getInteractingAtoms().getDeviceBuffer());
}
}
cl.executeKernel(pairValueKernel, nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize());
...
...
@@ -6729,11 +6747,10 @@ void OpenCLIntegrateLangevinStepKernel::execute(ContextImpl& context, const Lang
if (temperature != prevTemp || friction != prevFriction || stepSize != prevStepSize) {
// Calculate the integration parameters.
double tau = (friction == 0.0 ? 0.0 : 1.0/friction);
double kT = BOLTZ*temperature;
double vscale = exp(-stepSize
/tau
);
double fscale = (
1-vscale)*tau
;
double noisescale = sqrt(
2*kT/tau)*sqrt(0.5
*(1-vscale*vscale)
*tau
);
double vscale = exp(-stepSize
*friction
);
double fscale = (
friction == 0 ? stepSize : (1-vscale)/friction)
;
double noisescale = sqrt(
kT
*(1-vscale*vscale));
if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
vector<cl_double> p(params->getSize());
p[0] = vscale;
...
...
@@ -7011,13 +7028,13 @@ double OpenCLIntegrateVariableLangevinStepKernel::execute(ContextImpl& context,
if (useDouble) {
selectSizeKernel.setArg<cl_double>(0, maxStepSize);
selectSizeKernel.setArg<cl_double>(1, integrator.getErrorTolerance());
selectSizeKernel.setArg<cl_double>(2, integrator.getFriction()
== 0.0 ? 0.0 : 1.0/integrator.getFriction()
);
selectSizeKernel.setArg<cl_double>(2, integrator.getFriction());
selectSizeKernel.setArg<cl_double>(3, BOLTZ*integrator.getTemperature());
}
else {
selectSizeKernel.setArg<cl_float>(0, maxStepSizeFloat);
selectSizeKernel.setArg<cl_float>(1, (cl_float) integrator.getErrorTolerance());
selectSizeKernel.setArg<cl_float>(2, (cl_float)
(
integrator.getFriction()
== 0.0 ? 0.0 : 1.0/integrator.getFriction())
);
selectSizeKernel.setArg<cl_float>(2, (cl_float) integrator.getFriction());
selectSizeKernel.setArg<cl_float>(3, (cl_float) (BOLTZ*integrator.getTemperature()));
}
cl.executeKernel(selectSizeKernel, blockSize, blockSize);
...
...
@@ -7368,8 +7385,10 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
for (int step = numSteps-1; step >= 0; step--) {
if (stepType[step] == CustomIntegrator::ConstrainPositions)
beforeConstrain = true;
else if (stepType[step] == CustomIntegrator::ComputePerDof && variable[step] == "x" && beforeConstrain)
else if (stepType[step] == CustomIntegrator::ComputePerDof && variable[step] == "x" && beforeConstrain)
{
storePosAsDelta[step] = true;
beforeConstrain = false;
}
}
bool storedAsDelta = false;
for (int step = 0; step < numSteps; step++) {
...
...
@@ -7384,7 +7403,7 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
// Identify steps that can be merged into a single kernel.
for (int step = 1; step < numSteps; step++) {
if
((needsForces[step] || needsEnergy[step]) &&
(invalidatesForces[step-1] || forceGroupFlags[step] != forceGroupFlags[step-1])
)
if (invalidatesForces[step-1] || forceGroupFlags[step] != forceGroupFlags[step-1])
continue;
if (stepType[step-1] == CustomIntegrator::ComputePerDof && stepType[step] == CustomIntegrator::ComputePerDof)
merged[step] = true;
...
...
@@ -7394,6 +7413,7 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
needsForces[step-1] = (needsForces[step] || needsForces[step-1]);
needsEnergy[step-1] = (needsEnergy[step] || needsEnergy[step-1]);
needsGlobals[step-1] = (needsGlobals[step] || needsGlobals[step-1]);
computeBothForceAndEnergy[step-1] = (computeBothForceAndEnergy[step] || computeBothForceAndEnergy[step-1]);
}
// Loop over all steps and create the kernels for them.
...
...
@@ -7564,7 +7584,7 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
kineticEnergyKernel.setArg<cl::Buffer>(index++, sumBuffer->getDeviceBuffer());
index += 2;
kineticEnergyKernel.setArg<cl::Buffer>(index++, uniformRandoms->getDeviceBuffer());
if (cl.getUseDoublePrecision())
if (cl.getUseDoublePrecision()
|| cl.getUseMixedPrecision()
)
kineticEnergyKernel.setArg<cl_double>(index++, 0.0);
else
kineticEnergyKernel.setArg<cl_float>(index++, 0.0f);
...
...
@@ -7594,7 +7614,7 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
}
localValuesAreCurrent = false;
double stepSize = integrator.getStepSize();
recordGlobalValue(stepSize, GlobalTarget(DT, dtVariableIndex));
recordGlobalValue(stepSize, GlobalTarget(DT, dtVariableIndex)
, integrator
);
for (int i = 0; i < (int) parameterNames.size(); i++) {
double value = context.getParameter(parameterNames[i]);
if (value != globalValuesDouble[parameterVariableIndex[i]]) {
...
...
@@ -7713,7 +7733,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
kernels[step][0].setArg<cl_uint>(9, integration.prepareRandomNumbers(requiredGaussian[step]));
kernels[step][0].setArg<cl::Buffer>(8, integration.getRandom().getDeviceBuffer());
kernels[step][0].setArg<cl::Buffer>(10, uniformRandoms->getDeviceBuffer());
if (cl.getUseDoublePrecision())
if (cl.getUseDoublePrecision()
|| cl.getUseMixedPrecision()
)
kernels[step][0].setArg<cl_double>(11, energy);
else
kernels[step][0].setArg<cl_float>(11, (cl_float) energy);
...
...
@@ -7725,13 +7745,13 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
expressionSet.setVariable(uniformVariableIndex, SimTKOpenMMUtilities::getUniformlyDistributedRandomNumber());
expressionSet.setVariable(gaussianVariableIndex, SimTKOpenMMUtilities::getNormallyDistributedRandomNumber());
expressionSet.setVariable(stepEnergyVariableIndex[step], energy);
recordGlobalValue(globalExpressions[step][0].evaluate(), stepTarget[step]);
recordGlobalValue(globalExpressions[step][0].evaluate(), stepTarget[step]
, integrator
);
}
else if (stepType[step] == CustomIntegrator::ComputeSum) {
kernels[step][0].setArg<cl_uint>(9, integration.prepareRandomNumbers(requiredGaussian[step]));
kernels[step][0].setArg<cl::Buffer>(8, integration.getRandom().getDeviceBuffer());
kernels[step][0].setArg<cl::Buffer>(10, uniformRandoms->getDeviceBuffer());
if (cl.getUseDoublePrecision())
if (cl.getUseDoublePrecision()
|| cl.getUseMixedPrecision()
)
kernels[step][0].setArg<cl_double>(11, energy);
else
kernels[step][0].setArg<cl_float>(11, (cl_float) energy);
...
...
@@ -7743,12 +7763,12 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
double value;
summedValue->download(&value);
recordGlobalValue(value, stepTarget[step]);
recordGlobalValue(value, stepTarget[step]
, integrator
);
}
else {
float value;
summedValue->download(&value);
recordGlobalValue(value, stepTarget[step]);
recordGlobalValue(value, stepTarget[step]
, integrator
);
}
}
else if (stepType[step] == CustomIntegrator::UpdateContextState) {
...
...
@@ -7852,7 +7872,7 @@ double OpenCLIntegrateCustomStepKernel::computeKineticEnergy(ContextImpl& contex
}
}
void OpenCLIntegrateCustomStepKernel::recordGlobalValue(double value, GlobalTarget target) {
void OpenCLIntegrateCustomStepKernel::recordGlobalValue(double value, GlobalTarget target
, CustomIntegrator& integrator
) {
switch (target.type) {
case DT:
if (value != globalValuesDouble[dtVariableIndex])
...
...
@@ -7860,6 +7880,7 @@ void OpenCLIntegrateCustomStepKernel::recordGlobalValue(double value, GlobalTarg
expressionSet.setVariable(dtVariableIndex, value);
globalValuesDouble[dtVariableIndex] = value;
cl.getIntegrationUtilities().setNextStepSize(value);
integrator.setStepSize(value);
break;
case VARIABLE:
case PARAMETER:
...
...
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
View file @
3b6925ae
...
...
@@ -511,6 +511,8 @@ void OpenCLNonbondedUtilities::createKernelsForGroups(int groups) {
defines
[
"SIMD_WIDTH"
]
=
context
.
intToString
(
context
.
getSIMDWidth
());
if
(
usePeriodic
)
defines
[
"USE_PERIODIC"
]
=
"1"
;
if
(
context
.
getBoxIsTriclinic
())
defines
[
"TRICLINIC"
]
=
"1"
;
defines
[
"MAX_EXCLUSIONS"
]
=
context
.
intToString
(
maxExclusions
);
defines
[
"BUFFER_GROUPS"
]
=
(
deviceIsCpu
?
"4"
:
"2"
);
string
file
=
(
deviceIsCpu
?
OpenCLKernelSources
::
findInteractingBlocks_cpu
:
OpenCLKernelSources
::
findInteractingBlocks
);
...
...
platforms/opencl/src/OpenCLParallelKernels.cpp
View file @
3b6925ae
...
...
@@ -583,6 +583,10 @@ void OpenCLParallelCalcNonbondedForceKernel::getPMEParameters(double& alpha, int
dynamic_cast
<
const
OpenCLCalcNonbondedForceKernel
&>
(
kernels
[
0
].
getImpl
()).
getPMEParameters
(
alpha
,
nx
,
ny
,
nz
);
}
void
OpenCLParallelCalcNonbondedForceKernel
::
getLJPMEParameters
(
double
&
alpha
,
int
&
nx
,
int
&
ny
,
int
&
nz
)
const
{
dynamic_cast
<
const
OpenCLCalcNonbondedForceKernel
&>
(
kernels
[
0
].
getImpl
()).
getLJPMEParameters
(
alpha
,
nx
,
ny
,
nz
);
}
class
OpenCLParallelCalcCustomNonbondedForceKernel
::
Task
:
public
OpenCLContext
::
WorkTask
{
public:
Task
(
ContextImpl
&
context
,
OpenCLCalcCustomNonbondedForceKernel
&
kernel
,
bool
includeForce
,
...
...
Prev
1
2
3
4
5
6
7
8
9
10
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment