Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
3b6925ae
Commit
3b6925ae
authored
Jan 26, 2017
by
Andy Simmonett
Committed by
GitHub
Jan 26, 2017
Browse files
Merge pull request #1 from peastman/ljpme
Cleanup to LJ PME code
parents
5a8a8aa9
f7a102fb
Changes
193
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
487 additions
and
150 deletions
+487
-150
platforms/cuda/src/CudaNonbondedUtilities.cpp
platforms/cuda/src/CudaNonbondedUtilities.cpp
+48
-23
platforms/cuda/src/CudaParallelKernels.cpp
platforms/cuda/src/CudaParallelKernels.cpp
+19
-14
platforms/cuda/src/CudaPlatform.cpp
platforms/cuda/src/CudaPlatform.cpp
+4
-0
platforms/cuda/src/CudaSort.cpp
platforms/cuda/src/CudaSort.cpp
+2
-2
platforms/cuda/src/kernels/coulombLennardJones.cu
platforms/cuda/src/kernels/coulombLennardJones.cu
+45
-1
platforms/cuda/src/kernels/customGBEnergyN2.cu
platforms/cuda/src/kernels/customGBEnergyN2.cu
+10
-6
platforms/cuda/src/kernels/customGBValueN2.cu
platforms/cuda/src/kernels/customGBValueN2.cu
+1
-1
platforms/cuda/src/kernels/customIntegratorPerDof.cu
platforms/cuda/src/kernels/customIntegratorPerDof.cu
+1
-1
platforms/cuda/src/kernels/customNonbonded.cu
platforms/cuda/src/kernels/customNonbonded.cu
+4
-2
platforms/cuda/src/kernels/findInteractingBlocks.cu
platforms/cuda/src/kernels/findInteractingBlocks.cu
+122
-27
platforms/cuda/src/kernels/gbsaObc1.cu
platforms/cuda/src/kernels/gbsaObc1.cu
+11
-7
platforms/cuda/src/kernels/langevin.cu
platforms/cuda/src/kernels/langevin.cu
+4
-4
platforms/cuda/src/kernels/nonbonded.cu
platforms/cuda/src/kernels/nonbonded.cu
+60
-6
platforms/cuda/src/kernels/pme.cu
platforms/cuda/src/kernels/pme.cu
+66
-8
platforms/cuda/src/kernels/sort.cu
platforms/cuda/src/kernels/sort.cu
+11
-15
platforms/opencl/include/OpenCLKernels.h
platforms/opencl/include/OpenCLKernels.h
+13
-3
platforms/opencl/include/OpenCLParallelKernels.h
platforms/opencl/include/OpenCLParallelKernels.h
+10
-1
platforms/opencl/src/OpenCLKernels.cpp
platforms/opencl/src/OpenCLKernels.cpp
+50
-29
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+2
-0
platforms/opencl/src/OpenCLParallelKernels.cpp
platforms/opencl/src/OpenCLParallelKernels.cpp
+4
-0
No files found.
platforms/cuda/src/CudaNonbondedUtilities.cpp
View file @
3b6925ae
...
@@ -64,15 +64,16 @@ private:
...
@@ -64,15 +64,16 @@ private:
CudaNonbondedUtilities
::
CudaNonbondedUtilities
(
CudaContext
&
context
)
:
context
(
context
),
useCutoff
(
false
),
usePeriodic
(
false
),
anyExclusions
(
false
),
usePadding
(
true
),
CudaNonbondedUtilities
::
CudaNonbondedUtilities
(
CudaContext
&
context
)
:
context
(
context
),
useCutoff
(
false
),
usePeriodic
(
false
),
anyExclusions
(
false
),
usePadding
(
true
),
exclusionIndices
(
NULL
),
exclusionRowIndices
(
NULL
),
exclusionTiles
(
NULL
),
exclusions
(
NULL
),
interactingTiles
(
NULL
),
interactingAtoms
(
NULL
),
exclusionIndices
(
NULL
),
exclusionRowIndices
(
NULL
),
exclusionTiles
(
NULL
),
exclusions
(
NULL
),
interactingTiles
(
NULL
),
interactingAtoms
(
NULL
),
interactionCount
(
NULL
),
blockCenter
(
NULL
),
blockBoundingBox
(
NULL
),
sortedBlocks
(
NULL
),
sortedBlockCenter
(
NULL
),
sortedBlockBoundingBox
(
NULL
),
interactionCount
(
NULL
),
singlePairs
(
NULL
),
blockCenter
(
NULL
),
blockBoundingBox
(
NULL
),
sortedBlocks
(
NULL
),
sortedBlockCenter
(
NULL
),
sortedBlockBoundingBox
(
NULL
),
oldPositions
(
NULL
),
rebuildNeighborList
(
NULL
),
blockSorter
(
NULL
),
pinnedCountBuffer
(
NULL
),
forceRebuildNeighborList
(
true
),
lastCutoff
(
0.0
),
groupFlags
(
0
)
{
oldPositions
(
NULL
),
rebuildNeighborList
(
NULL
),
blockSorter
(
NULL
),
pinnedCountBuffer
(
NULL
),
forceRebuildNeighborList
(
true
),
lastCutoff
(
0.0
),
groupFlags
(
0
),
canUsePairList
(
true
)
{
// Decide how many thread blocks to use.
// Decide how many thread blocks to use.
string
errorMessage
=
"Error initializing nonbonded utilities"
;
string
errorMessage
=
"Error initializing nonbonded utilities"
;
int
multiprocessors
;
int
multiprocessors
;
CHECK_RESULT
(
cuDeviceGetAttribute
(
&
multiprocessors
,
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
,
context
.
getDevice
()));
CHECK_RESULT
(
cuDeviceGetAttribute
(
&
multiprocessors
,
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
,
context
.
getDevice
()));
CHECK_RESULT
(
cuEventCreate
(
&
downloadCountEvent
,
0
));
CHECK_RESULT
(
cuEventCreate
(
&
downloadCountEvent
,
0
));
CHECK_RESULT
(
cuMemHostAlloc
((
void
**
)
&
pinnedCountBuffer
,
sizeof
(
int
),
CU_MEMHOSTALLOC_PORTABLE
));
CHECK_RESULT
(
cuMemHostAlloc
((
void
**
)
&
pinnedCountBuffer
,
2
*
sizeof
(
int
),
CU_MEMHOSTALLOC_PORTABLE
));
numForceThreadBlocks
=
4
*
multiprocessors
;
numForceThreadBlocks
=
4
*
multiprocessors
;
forceThreadBlockSize
=
(
context
.
getComputeCapability
()
<
2.0
?
128
:
256
);
forceThreadBlockSize
=
(
context
.
getComputeCapability
()
<
2.0
?
128
:
256
);
}
}
...
@@ -92,6 +93,8 @@ CudaNonbondedUtilities::~CudaNonbondedUtilities() {
...
@@ -92,6 +93,8 @@ CudaNonbondedUtilities::~CudaNonbondedUtilities() {
delete
interactingAtoms
;
delete
interactingAtoms
;
if
(
interactionCount
!=
NULL
)
if
(
interactionCount
!=
NULL
)
delete
interactionCount
;
delete
interactionCount
;
if
(
singlePairs
!=
NULL
)
delete
singlePairs
;
if
(
blockCenter
!=
NULL
)
if
(
blockCenter
!=
NULL
)
delete
blockCenter
;
delete
blockCenter
;
if
(
blockBoundingBox
!=
NULL
)
if
(
blockBoundingBox
!=
NULL
)
...
@@ -113,7 +116,7 @@ CudaNonbondedUtilities::~CudaNonbondedUtilities() {
...
@@ -113,7 +116,7 @@ CudaNonbondedUtilities::~CudaNonbondedUtilities() {
cuEventDestroy
(
downloadCountEvent
);
cuEventDestroy
(
downloadCountEvent
);
}
}
void
CudaNonbondedUtilities
::
addInteraction
(
bool
usesCutoff
,
bool
usesPeriodic
,
bool
usesExclusions
,
double
cutoffDistance
,
const
vector
<
vector
<
int
>
>&
exclusionList
,
const
string
&
kernel
,
int
forceGroup
)
{
void
CudaNonbondedUtilities
::
addInteraction
(
bool
usesCutoff
,
bool
usesPeriodic
,
bool
usesExclusions
,
double
cutoffDistance
,
const
vector
<
vector
<
int
>
>&
exclusionList
,
const
string
&
kernel
,
int
forceGroup
,
bool
supportsPairList
)
{
if
(
groupCutoff
.
size
()
>
0
)
{
if
(
groupCutoff
.
size
()
>
0
)
{
if
(
usesCutoff
!=
useCutoff
)
if
(
usesCutoff
!=
useCutoff
)
throw
OpenMMException
(
"All Forces must agree on whether to use a cutoff"
);
throw
OpenMMException
(
"All Forces must agree on whether to use a cutoff"
);
...
@@ -128,6 +131,7 @@ void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic,
...
@@ -128,6 +131,7 @@ void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic,
usePeriodic
=
usesPeriodic
;
usePeriodic
=
usesPeriodic
;
groupCutoff
[
forceGroup
]
=
cutoffDistance
;
groupCutoff
[
forceGroup
]
=
cutoffDistance
;
groupFlags
|=
1
<<
forceGroup
;
groupFlags
|=
1
<<
forceGroup
;
canUsePairList
&=
supportsPairList
;
if
(
kernel
.
size
()
>
0
)
{
if
(
kernel
.
size
()
>
0
)
{
if
(
groupKernelSource
.
find
(
forceGroup
)
==
groupKernelSource
.
end
())
if
(
groupKernelSource
.
find
(
forceGroup
)
==
groupKernelSource
.
end
())
groupKernelSource
[
forceGroup
]
=
""
;
groupKernelSource
[
forceGroup
]
=
""
;
...
@@ -279,9 +283,11 @@ void CudaNonbondedUtilities::initialize(const System& system) {
...
@@ -279,9 +283,11 @@ void CudaNonbondedUtilities::initialize(const System& system) {
maxTiles
=
numTiles
;
maxTiles
=
numTiles
;
if
(
maxTiles
<
1
)
if
(
maxTiles
<
1
)
maxTiles
=
1
;
maxTiles
=
1
;
maxSinglePairs
=
5
*
numAtoms
;
interactingTiles
=
CudaArray
::
create
<
int
>
(
context
,
maxTiles
,
"interactingTiles"
);
interactingTiles
=
CudaArray
::
create
<
int
>
(
context
,
maxTiles
,
"interactingTiles"
);
interactingAtoms
=
CudaArray
::
create
<
int
>
(
context
,
CudaContext
::
TileSize
*
maxTiles
,
"interactingAtoms"
);
interactingAtoms
=
CudaArray
::
create
<
int
>
(
context
,
CudaContext
::
TileSize
*
maxTiles
,
"interactingAtoms"
);
interactionCount
=
CudaArray
::
create
<
unsigned
int
>
(
context
,
1
,
"interactionCount"
);
interactionCount
=
CudaArray
::
create
<
unsigned
int
>
(
context
,
2
,
"interactionCount"
);
singlePairs
=
CudaArray
::
create
<
int2
>
(
context
,
maxSinglePairs
,
"singlePairs"
);
int
elementSize
=
(
context
.
getUseDoublePrecision
()
?
sizeof
(
double
)
:
sizeof
(
float
));
int
elementSize
=
(
context
.
getUseDoublePrecision
()
?
sizeof
(
double
)
:
sizeof
(
float
));
blockCenter
=
new
CudaArray
(
context
,
numAtomBlocks
,
4
*
elementSize
,
"blockCenter"
);
blockCenter
=
new
CudaArray
(
context
,
numAtomBlocks
,
4
*
elementSize
,
"blockCenter"
);
blockBoundingBox
=
new
CudaArray
(
context
,
numAtomBlocks
,
4
*
elementSize
,
"blockBoundingBox"
);
blockBoundingBox
=
new
CudaArray
(
context
,
numAtomBlocks
,
4
*
elementSize
,
"blockBoundingBox"
);
...
@@ -291,7 +297,7 @@ void CudaNonbondedUtilities::initialize(const System& system) {
...
@@ -291,7 +297,7 @@ void CudaNonbondedUtilities::initialize(const System& system) {
oldPositions
=
new
CudaArray
(
context
,
numAtoms
,
4
*
elementSize
,
"oldPositions"
);
oldPositions
=
new
CudaArray
(
context
,
numAtoms
,
4
*
elementSize
,
"oldPositions"
);
rebuildNeighborList
=
CudaArray
::
create
<
int
>
(
context
,
1
,
"rebuildNeighborList"
);
rebuildNeighborList
=
CudaArray
::
create
<
int
>
(
context
,
1
,
"rebuildNeighborList"
);
blockSorter
=
new
CudaSort
(
context
,
new
BlockSortTrait
(
context
.
getUseDoublePrecision
()),
numAtomBlocks
);
blockSorter
=
new
CudaSort
(
context
,
new
BlockSortTrait
(
context
.
getUseDoublePrecision
()),
numAtomBlocks
);
vector
<
unsigned
int
>
count
(
1
,
0
);
vector
<
unsigned
int
>
count
(
2
,
0
);
interactionCount
->
upload
(
count
);
interactionCount
->
upload
(
count
);
}
}
...
@@ -316,6 +322,8 @@ void CudaNonbondedUtilities::initialize(const System& system) {
...
@@ -316,6 +322,8 @@ void CudaNonbondedUtilities::initialize(const System& system) {
forceArgs
.
push_back
(
&
blockCenter
->
getDevicePointer
());
forceArgs
.
push_back
(
&
blockCenter
->
getDevicePointer
());
forceArgs
.
push_back
(
&
blockBoundingBox
->
getDevicePointer
());
forceArgs
.
push_back
(
&
blockBoundingBox
->
getDevicePointer
());
forceArgs
.
push_back
(
&
interactingAtoms
->
getDevicePointer
());
forceArgs
.
push_back
(
&
interactingAtoms
->
getDevicePointer
());
forceArgs
.
push_back
(
&
maxSinglePairs
);
forceArgs
.
push_back
(
&
singlePairs
->
getDevicePointer
());
}
}
for
(
int
i
=
0
;
i
<
(
int
)
parameters
.
size
();
i
++
)
for
(
int
i
=
0
;
i
<
(
int
)
parameters
.
size
();
i
++
)
forceArgs
.
push_back
(
&
parameters
[
i
].
getMemory
());
forceArgs
.
push_back
(
&
parameters
[
i
].
getMemory
());
...
@@ -353,8 +361,10 @@ void CudaNonbondedUtilities::initialize(const System& system) {
...
@@ -353,8 +361,10 @@ void CudaNonbondedUtilities::initialize(const System& system) {
findInteractingBlocksArgs
.
push_back
(
&
interactionCount
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
interactionCount
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
interactingTiles
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
interactingTiles
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
interactingAtoms
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
interactingAtoms
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
singlePairs
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
context
.
getPosq
().
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
context
.
getPosq
().
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
maxTiles
);
findInteractingBlocksArgs
.
push_back
(
&
maxTiles
);
findInteractingBlocksArgs
.
push_back
(
&
maxSinglePairs
);
findInteractingBlocksArgs
.
push_back
(
&
startBlockIndex
);
findInteractingBlocksArgs
.
push_back
(
&
startBlockIndex
);
findInteractingBlocksArgs
.
push_back
(
&
numBlocks
);
findInteractingBlocksArgs
.
push_back
(
&
numBlocks
);
findInteractingBlocksArgs
.
push_back
(
&
sortedBlocks
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
sortedBlocks
->
getDevicePointer
());
...
@@ -424,28 +434,39 @@ void CudaNonbondedUtilities::computeInteractions(int forceGroups, bool includeFo
...
@@ -424,28 +434,39 @@ void CudaNonbondedUtilities::computeInteractions(int forceGroups, bool includeFo
bool
CudaNonbondedUtilities
::
updateNeighborListSize
()
{
bool
CudaNonbondedUtilities
::
updateNeighborListSize
()
{
if
(
!
useCutoff
)
if
(
!
useCutoff
)
return
false
;
return
false
;
if
(
pinnedCountBuffer
[
0
]
<=
(
unsigned
int
)
maxTile
s
)
if
(
pinnedCountBuffer
[
0
]
<=
maxTiles
&&
pinnedCountBuffer
[
1
]
<=
maxSinglePair
s
)
return
false
;
return
false
;
// The most recent timestep had too many interactions to fit in the arrays. Make the arrays bigger to prevent
// The most recent timestep had too many interactions to fit in the arrays. Make the arrays bigger to prevent
// this from happening in the future.
// this from happening in the future.
maxTiles
=
(
int
)
(
1.2
*
pinnedCountBuffer
[
0
]);
if
(
pinnedCountBuffer
[
0
]
>
maxTiles
)
{
int
totalTiles
=
context
.
getNumAtomBlocks
()
*
(
context
.
getNumAtomBlocks
()
+
1
)
/
2
;
maxTiles
=
(
int
)
(
1.2
*
pinnedCountBuffer
[
0
]);
if
(
maxTiles
>
totalTiles
)
int
totalTiles
=
context
.
getNumAtomBlocks
()
*
(
context
.
getNumAtomBlocks
()
+
1
)
/
2
;
maxTiles
=
totalTiles
;
if
(
maxTiles
>
totalTiles
)
delete
interactingTiles
;
maxTiles
=
totalTiles
;
delete
interactingAtoms
;
delete
interactingTiles
;
interactingTiles
=
NULL
;
// Avoid an error in the destructor if the following allocation fails
delete
interactingAtoms
;
interactingAtoms
=
NULL
;
interactingTiles
=
NULL
;
// Avoid an error in the destructor if the following allocation fails
interactingTiles
=
CudaArray
::
create
<
int
>
(
context
,
maxTiles
,
"interactingTiles"
);
interactingAtoms
=
NULL
;
interactingAtoms
=
CudaArray
::
create
<
int
>
(
context
,
CudaContext
::
TileSize
*
maxTiles
,
"interactingAtoms"
);
interactingTiles
=
CudaArray
::
create
<
int
>
(
context
,
maxTiles
,
"interactingTiles"
);
if
(
forceArgs
.
size
()
>
0
)
interactingAtoms
=
CudaArray
::
create
<
int
>
(
context
,
CudaContext
::
TileSize
*
maxTiles
,
"interactingAtoms"
);
forceArgs
[
7
]
=
&
interactingTiles
->
getDevicePointer
();
if
(
forceArgs
.
size
()
>
0
)
findInteractingBlocksArgs
[
6
]
=
&
interactingTiles
->
getDevicePointer
();
forceArgs
[
7
]
=
&
interactingTiles
->
getDevicePointer
();
if
(
forceArgs
.
size
()
>
0
)
findInteractingBlocksArgs
[
6
]
=
&
interactingTiles
->
getDevicePointer
();
forceArgs
[
17
]
=
&
interactingAtoms
->
getDevicePointer
();
if
(
forceArgs
.
size
()
>
0
)
findInteractingBlocksArgs
[
7
]
=
&
interactingAtoms
->
getDevicePointer
();
forceArgs
[
17
]
=
&
interactingAtoms
->
getDevicePointer
();
findInteractingBlocksArgs
[
7
]
=
&
interactingAtoms
->
getDevicePointer
();
}
if
(
pinnedCountBuffer
[
1
]
>
maxSinglePairs
)
{
maxSinglePairs
=
(
int
)
(
1.2
*
pinnedCountBuffer
[
1
]);
delete
singlePairs
;
singlePairs
=
NULL
;
// Avoid an error in the destructor if the following allocation fails
singlePairs
=
CudaArray
::
create
<
int2
>
(
context
,
maxSinglePairs
,
"singlePairs"
);
if
(
forceArgs
.
size
()
>
0
)
forceArgs
[
19
]
=
&
singlePairs
->
getDevicePointer
();
findInteractingBlocksArgs
[
8
]
=
&
singlePairs
->
getDevicePointer
();
}
forceRebuildNeighborList
=
true
;
forceRebuildNeighborList
=
true
;
context
.
setForcesValid
(
false
);
context
.
setForcesValid
(
false
);
return
true
;
return
true
;
...
@@ -492,7 +513,11 @@ void CudaNonbondedUtilities::createKernelsForGroups(int groups) {
...
@@ -492,7 +513,11 @@ void CudaNonbondedUtilities::createKernelsForGroups(int groups) {
defines
[
"NUM_TILES_WITH_EXCLUSIONS"
]
=
context
.
intToString
(
exclusionTiles
->
getSize
());
defines
[
"NUM_TILES_WITH_EXCLUSIONS"
]
=
context
.
intToString
(
exclusionTiles
->
getSize
());
if
(
usePeriodic
)
if
(
usePeriodic
)
defines
[
"USE_PERIODIC"
]
=
"1"
;
defines
[
"USE_PERIODIC"
]
=
"1"
;
if
(
context
.
getBoxIsTriclinic
())
defines
[
"TRICLINIC"
]
=
"1"
;
defines
[
"MAX_EXCLUSIONS"
]
=
context
.
intToString
(
maxExclusions
);
defines
[
"MAX_EXCLUSIONS"
]
=
context
.
intToString
(
maxExclusions
);
// Temporarily disable the pair list until we figure out why it's failing on some GPUs.
defines
[
"MAX_BITS_FOR_PAIRS"
]
=
"0"
;
//(canUsePairList ? "2" : "0");
CUmodule
interactingBlocksProgram
=
context
.
createModule
(
CudaKernelSources
::
vectorOps
+
CudaKernelSources
::
findInteractingBlocks
,
defines
);
CUmodule
interactingBlocksProgram
=
context
.
createModule
(
CudaKernelSources
::
vectorOps
+
CudaKernelSources
::
findInteractingBlocks
,
defines
);
kernels
.
findBlockBoundsKernel
=
context
.
getKernel
(
interactingBlocksProgram
,
"findBlockBounds"
);
kernels
.
findBlockBoundsKernel
=
context
.
getKernel
(
interactingBlocksProgram
,
"findBlockBounds"
);
kernels
.
sortBoxDataKernel
=
context
.
getKernel
(
interactingBlocksProgram
,
"sortBoxData"
);
kernels
.
sortBoxDataKernel
=
context
.
getKernel
(
interactingBlocksProgram
,
"sortBoxData"
);
...
...
platforms/cuda/src/CudaParallelKernels.cpp
View file @
3b6925ae
...
@@ -63,8 +63,8 @@ if (result != CUDA_SUCCESS) { \
...
@@ -63,8 +63,8 @@ if (result != CUDA_SUCCESS) { \
class
CudaParallelCalcForcesAndEnergyKernel
::
BeginComputationTask
:
public
CudaContext
::
WorkTask
{
class
CudaParallelCalcForcesAndEnergyKernel
::
BeginComputationTask
:
public
CudaContext
::
WorkTask
{
public:
public:
BeginComputationTask
(
ContextImpl
&
context
,
CudaContext
&
cu
,
CudaCalcForcesAndEnergyKernel
&
kernel
,
BeginComputationTask
(
ContextImpl
&
context
,
CudaContext
&
cu
,
CudaCalcForcesAndEnergyKernel
&
kernel
,
bool
includeForce
,
bool
includeEnergy
,
int
groups
,
void
*
pinnedMemory
,
CUevent
event
,
int
&
numTiles
)
:
context
(
context
),
cu
(
cu
),
kernel
(
kernel
),
bool
includeForce
,
bool
includeEnergy
,
int
groups
,
void
*
pinnedMemory
,
CUevent
event
,
int
2
&
interactionCount
)
:
context
(
context
),
cu
(
cu
),
kernel
(
kernel
),
includeForce
(
includeForce
),
includeEnergy
(
includeEnergy
),
groups
(
groups
),
pinnedMemory
(
pinnedMemory
),
event
(
event
),
numTiles
(
numTiles
)
{
includeForce
(
includeForce
),
includeEnergy
(
includeEnergy
),
groups
(
groups
),
pinnedMemory
(
pinnedMemory
),
event
(
event
),
interactionCount
(
interactionCount
)
{
}
}
void
execute
()
{
void
execute
()
{
// Copy coordinates over to this device and execute the kernel.
// Copy coordinates over to this device and execute the kernel.
...
@@ -77,7 +77,7 @@ public:
...
@@ -77,7 +77,7 @@ public:
}
}
kernel
.
beginComputation
(
context
,
includeForce
,
includeEnergy
,
groups
);
kernel
.
beginComputation
(
context
,
includeForce
,
includeEnergy
,
groups
);
if
(
cu
.
getNonbondedUtilities
().
getUsePeriodic
())
if
(
cu
.
getNonbondedUtilities
().
getUsePeriodic
())
cu
.
getNonbondedUtilities
().
getInteractionCount
().
download
(
&
numTiles
,
false
);
cu
.
getNonbondedUtilities
().
getInteractionCount
().
download
(
&
interactionCount
,
false
);
}
}
private:
private:
ContextImpl
&
context
;
ContextImpl
&
context
;
...
@@ -87,15 +87,15 @@ private:
...
@@ -87,15 +87,15 @@ private:
int
groups
;
int
groups
;
void
*
pinnedMemory
;
void
*
pinnedMemory
;
CUevent
event
;
CUevent
event
;
int
&
numTiles
;
int
2
&
interactionCount
;
};
};
class
CudaParallelCalcForcesAndEnergyKernel
::
FinishComputationTask
:
public
CudaContext
::
WorkTask
{
class
CudaParallelCalcForcesAndEnergyKernel
::
FinishComputationTask
:
public
CudaContext
::
WorkTask
{
public:
public:
FinishComputationTask
(
ContextImpl
&
context
,
CudaContext
&
cu
,
CudaCalcForcesAndEnergyKernel
&
kernel
,
FinishComputationTask
(
ContextImpl
&
context
,
CudaContext
&
cu
,
CudaCalcForcesAndEnergyKernel
&
kernel
,
bool
includeForce
,
bool
includeEnergy
,
int
groups
,
double
&
energy
,
long
long
&
completionTime
,
long
long
*
pinnedMemory
,
CudaArray
&
contextForces
,
bool
&
valid
,
int
&
numTiles
)
:
bool
includeForce
,
bool
includeEnergy
,
int
groups
,
double
&
energy
,
long
long
&
completionTime
,
long
long
*
pinnedMemory
,
CudaArray
&
contextForces
,
bool
&
valid
,
int
2
&
interactionCount
)
:
context
(
context
),
cu
(
cu
),
kernel
(
kernel
),
includeForce
(
includeForce
),
includeEnergy
(
includeEnergy
),
groups
(
groups
),
energy
(
energy
),
context
(
context
),
cu
(
cu
),
kernel
(
kernel
),
includeForce
(
includeForce
),
includeEnergy
(
includeEnergy
),
groups
(
groups
),
energy
(
energy
),
completionTime
(
completionTime
),
pinnedMemory
(
pinnedMemory
),
contextForces
(
contextForces
),
valid
(
valid
),
numTiles
(
numTiles
)
{
completionTime
(
completionTime
),
pinnedMemory
(
pinnedMemory
),
contextForces
(
contextForces
),
valid
(
valid
),
interactionCount
(
interactionCount
)
{
}
}
void
execute
()
{
void
execute
()
{
// Execute the kernel, then download forces.
// Execute the kernel, then download forces.
...
@@ -120,7 +120,8 @@ public:
...
@@ -120,7 +120,8 @@ public:
cu
.
getForce
().
download
(
&
pinnedMemory
[(
cu
.
getContextIndex
()
-
1
)
*
numAtoms
*
3
]);
cu
.
getForce
().
download
(
&
pinnedMemory
[(
cu
.
getContextIndex
()
-
1
)
*
numAtoms
*
3
]);
}
}
}
}
if
(
cu
.
getNonbondedUtilities
().
getUsePeriodic
()
&&
numTiles
>
cu
.
getNonbondedUtilities
().
getInteractingTiles
().
getSize
())
{
if
(
cu
.
getNonbondedUtilities
().
getUsePeriodic
()
&&
(
interactionCount
.
x
>
cu
.
getNonbondedUtilities
().
getInteractingTiles
().
getSize
()
||
interactionCount
.
y
>
cu
.
getNonbondedUtilities
().
getSinglePairs
().
getSize
()))
{
valid
=
false
;
valid
=
false
;
cu
.
getNonbondedUtilities
().
updateNeighborListSize
();
cu
.
getNonbondedUtilities
().
updateNeighborListSize
();
}
}
...
@@ -136,12 +137,12 @@ private:
...
@@ -136,12 +137,12 @@ private:
long
long
*
pinnedMemory
;
long
long
*
pinnedMemory
;
CudaArray
&
contextForces
;
CudaArray
&
contextForces
;
bool
&
valid
;
bool
&
valid
;
int
&
numTiles
;
int
2
&
interactionCount
;
};
};
CudaParallelCalcForcesAndEnergyKernel
::
CudaParallelCalcForcesAndEnergyKernel
(
string
name
,
const
Platform
&
platform
,
CudaPlatform
::
PlatformData
&
data
)
:
CudaParallelCalcForcesAndEnergyKernel
::
CudaParallelCalcForcesAndEnergyKernel
(
string
name
,
const
Platform
&
platform
,
CudaPlatform
::
PlatformData
&
data
)
:
CalcForcesAndEnergyKernel
(
name
,
platform
),
data
(
data
),
completionTimes
(
data
.
contexts
.
size
()),
contextNonbondedFractions
(
data
.
contexts
.
size
()),
CalcForcesAndEnergyKernel
(
name
,
platform
),
data
(
data
),
completionTimes
(
data
.
contexts
.
size
()),
contextNonbondedFractions
(
data
.
contexts
.
size
()),
tile
Counts
(
NULL
),
contextForces
(
NULL
),
pinnedPositionBuffer
(
NULL
),
pinnedForceBuffer
(
NULL
)
{
interaction
Counts
(
NULL
),
contextForces
(
NULL
),
pinnedPositionBuffer
(
NULL
),
pinnedForceBuffer
(
NULL
)
{
for
(
int
i
=
0
;
i
<
(
int
)
data
.
contexts
.
size
();
i
++
)
for
(
int
i
=
0
;
i
<
(
int
)
data
.
contexts
.
size
();
i
++
)
kernels
.
push_back
(
Kernel
(
new
CudaCalcForcesAndEnergyKernel
(
name
,
platform
,
*
data
.
contexts
[
i
])));
kernels
.
push_back
(
Kernel
(
new
CudaCalcForcesAndEnergyKernel
(
name
,
platform
,
*
data
.
contexts
[
i
])));
}
}
...
@@ -156,8 +157,8 @@ CudaParallelCalcForcesAndEnergyKernel::~CudaParallelCalcForcesAndEnergyKernel()
...
@@ -156,8 +157,8 @@ CudaParallelCalcForcesAndEnergyKernel::~CudaParallelCalcForcesAndEnergyKernel()
cuMemFreeHost
(
pinnedForceBuffer
);
cuMemFreeHost
(
pinnedForceBuffer
);
cuEventDestroy
(
event
);
cuEventDestroy
(
event
);
cuStreamDestroy
(
peerCopyStream
);
cuStreamDestroy
(
peerCopyStream
);
if
(
tile
Counts
!=
NULL
)
if
(
interaction
Counts
!=
NULL
)
cuMemFreeHost
(
tile
Counts
);
cuMemFreeHost
(
interaction
Counts
);
}
}
void
CudaParallelCalcForcesAndEnergyKernel
::
initialize
(
const
System
&
system
)
{
void
CudaParallelCalcForcesAndEnergyKernel
::
initialize
(
const
System
&
system
)
{
...
@@ -172,7 +173,7 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
...
@@ -172,7 +173,7 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
contextNonbondedFractions
[
i
]
=
1
/
(
double
)
numContexts
;
contextNonbondedFractions
[
i
]
=
1
/
(
double
)
numContexts
;
CHECK_RESULT
(
cuEventCreate
(
&
event
,
0
),
"Error creating event"
);
CHECK_RESULT
(
cuEventCreate
(
&
event
,
0
),
"Error creating event"
);
CHECK_RESULT
(
cuStreamCreate
(
&
peerCopyStream
,
CU_STREAM_NON_BLOCKING
),
"Error creating stream"
);
CHECK_RESULT
(
cuStreamCreate
(
&
peerCopyStream
,
CU_STREAM_NON_BLOCKING
),
"Error creating stream"
);
CHECK_RESULT
(
cuMemHostAlloc
((
void
**
)
&
tile
Counts
,
numContexts
*
sizeof
(
int
),
0
),
"Error creating
tile
count buffer"
);
CHECK_RESULT
(
cuMemHostAlloc
((
void
**
)
&
interaction
Counts
,
numContexts
*
sizeof
(
int
2
),
0
),
"Error creating
interaction
count
s
buffer"
);
}
}
void
CudaParallelCalcForcesAndEnergyKernel
::
beginComputation
(
ContextImpl
&
context
,
bool
includeForce
,
bool
includeEnergy
,
int
groups
)
{
void
CudaParallelCalcForcesAndEnergyKernel
::
beginComputation
(
ContextImpl
&
context
,
bool
includeForce
,
bool
includeEnergy
,
int
groups
)
{
...
@@ -202,7 +203,7 @@ void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& contex
...
@@ -202,7 +203,7 @@ void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& contex
data
.
contextEnergy
[
i
]
=
0.0
;
data
.
contextEnergy
[
i
]
=
0.0
;
CudaContext
&
cu
=
*
data
.
contexts
[
i
];
CudaContext
&
cu
=
*
data
.
contexts
[
i
];
CudaContext
::
WorkThread
&
thread
=
cu
.
getWorkThread
();
CudaContext
::
WorkThread
&
thread
=
cu
.
getWorkThread
();
thread
.
addTask
(
new
BeginComputationTask
(
context
,
cu
,
getKernel
(
i
),
includeForce
,
includeEnergy
,
groups
,
pinnedPositionBuffer
,
event
,
tile
Counts
[
i
]));
thread
.
addTask
(
new
BeginComputationTask
(
context
,
cu
,
getKernel
(
i
),
includeForce
,
includeEnergy
,
groups
,
pinnedPositionBuffer
,
event
,
interaction
Counts
[
i
]));
}
}
}
}
...
@@ -210,7 +211,7 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
...
@@ -210,7 +211,7 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
for
(
int
i
=
0
;
i
<
(
int
)
data
.
contexts
.
size
();
i
++
)
{
for
(
int
i
=
0
;
i
<
(
int
)
data
.
contexts
.
size
();
i
++
)
{
CudaContext
&
cu
=
*
data
.
contexts
[
i
];
CudaContext
&
cu
=
*
data
.
contexts
[
i
];
CudaContext
::
WorkThread
&
thread
=
cu
.
getWorkThread
();
CudaContext
::
WorkThread
&
thread
=
cu
.
getWorkThread
();
thread
.
addTask
(
new
FinishComputationTask
(
context
,
cu
,
getKernel
(
i
),
includeForce
,
includeEnergy
,
groups
,
data
.
contextEnergy
[
i
],
completionTimes
[
i
],
pinnedForceBuffer
,
*
contextForces
,
valid
,
tile
Counts
[
i
]));
thread
.
addTask
(
new
FinishComputationTask
(
context
,
cu
,
getKernel
(
i
),
includeForce
,
includeEnergy
,
groups
,
data
.
contextEnergy
[
i
],
completionTimes
[
i
],
pinnedForceBuffer
,
*
contextForces
,
valid
,
interaction
Counts
[
i
]));
}
}
data
.
syncContexts
();
data
.
syncContexts
();
double
energy
=
0.0
;
double
energy
=
0.0
;
...
@@ -627,6 +628,10 @@ void CudaParallelCalcNonbondedForceKernel::getPMEParameters(double& alpha, int&
...
@@ -627,6 +628,10 @@ void CudaParallelCalcNonbondedForceKernel::getPMEParameters(double& alpha, int&
dynamic_cast
<
const
CudaCalcNonbondedForceKernel
&>
(
kernels
[
0
].
getImpl
()).
getPMEParameters
(
alpha
,
nx
,
ny
,
nz
);
dynamic_cast
<
const
CudaCalcNonbondedForceKernel
&>
(
kernels
[
0
].
getImpl
()).
getPMEParameters
(
alpha
,
nx
,
ny
,
nz
);
}
}
void
CudaParallelCalcNonbondedForceKernel
::
getLJPMEParameters
(
double
&
alpha
,
int
&
nx
,
int
&
ny
,
int
&
nz
)
const
{
dynamic_cast
<
const
CudaCalcNonbondedForceKernel
&>
(
kernels
[
0
].
getImpl
()).
getLJPMEParameters
(
alpha
,
nx
,
ny
,
nz
);
}
class
CudaParallelCalcCustomNonbondedForceKernel
::
Task
:
public
CudaContext
::
WorkTask
{
class
CudaParallelCalcCustomNonbondedForceKernel
::
Task
:
public
CudaContext
::
WorkTask
{
public:
public:
Task
(
ContextImpl
&
context
,
CudaCalcCustomNonbondedForceKernel
&
kernel
,
bool
includeForce
,
Task
(
ContextImpl
&
context
,
CudaCalcCustomNonbondedForceKernel
&
kernel
,
bool
includeForce
,
...
...
platforms/cuda/src/CudaPlatform.cpp
View file @
3b6925ae
...
@@ -247,6 +247,10 @@ CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& sys
...
@@ -247,6 +247,10 @@ CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& sys
CHECK_RESULT
(
cuDeviceGetName
(
name
,
1000
,
contexts
[
i
]
->
getDevice
()),
"Error querying device name"
);
CHECK_RESULT
(
cuDeviceGetName
(
name
,
1000
,
contexts
[
i
]
->
getDevice
()),
"Error querying device name"
);
deviceName
<<
name
;
deviceName
<<
name
;
}
}
size_t
printfsize
;
cuCtxGetLimit
(
&
printfsize
,
CU_LIMIT_PRINTF_FIFO_SIZE
);
cuCtxSetLimit
(
CU_LIMIT_PRINTF_FIFO_SIZE
,
10
*
printfsize
);
useCpuPme
=
(
cpuPmeProperty
==
"true"
&&
!
contexts
[
0
]
->
getUseDoublePrecision
());
useCpuPme
=
(
cpuPmeProperty
==
"true"
&&
!
contexts
[
0
]
->
getUseDoublePrecision
());
disablePmeStream
=
(
pmeStreamProperty
==
"true"
);
disablePmeStream
=
(
pmeStreamProperty
==
"true"
);
deterministicForces
=
(
deterministicForcesProperty
==
"true"
);
deterministicForces
=
(
deterministicForcesProperty
==
"true"
);
...
...
platforms/cuda/src/CudaSort.cpp
View file @
3b6925ae
...
@@ -114,13 +114,13 @@ void CudaSort::sort(CudaArray& data) {
...
@@ -114,13 +114,13 @@ void CudaSort::sort(CudaArray& data) {
unsigned
int
numBuckets
=
bucketOffset
->
getSize
();
unsigned
int
numBuckets
=
bucketOffset
->
getSize
();
void
*
rangeArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
dataLength
,
&
dataRange
->
getDevicePointer
(),
&
numBuckets
,
&
bucketOffset
->
getDevicePointer
()};
void
*
rangeArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
dataLength
,
&
dataRange
->
getDevicePointer
(),
&
numBuckets
,
&
bucketOffset
->
getDevicePointer
()};
context
.
executeKernel
(
computeRangeKernel
,
rangeArgs
,
rangeKernelSize
,
rangeKernelSize
,
rangeKernelSize
*
trait
->
getKeySize
());
context
.
executeKernel
(
computeRangeKernel
,
rangeArgs
,
rangeKernelSize
,
rangeKernelSize
,
2
*
rangeKernelSize
*
trait
->
getKeySize
());
// Assign array elements to buckets.
// Assign array elements to buckets.
void
*
elementsArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
dataLength
,
&
numBuckets
,
&
dataRange
->
getDevicePointer
(),
void
*
elementsArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
dataLength
,
&
numBuckets
,
&
dataRange
->
getDevicePointer
(),
&
bucketOffset
->
getDevicePointer
(),
&
bucketOfElement
->
getDevicePointer
(),
&
offsetInBucket
->
getDevicePointer
()};
&
bucketOffset
->
getDevicePointer
(),
&
bucketOfElement
->
getDevicePointer
(),
&
offsetInBucket
->
getDevicePointer
()};
context
.
executeKernel
(
assignElementsKernel
,
elementsArgs
,
data
.
getSize
());
context
.
executeKernel
(
assignElementsKernel
,
elementsArgs
,
data
.
getSize
()
,
128
);
// Compute the position of each bucket.
// Compute the position of each bucket.
...
...
platforms/cuda/src/kernels/coulombLennardJones.cu
View file @
3b6925ae
...
@@ -17,6 +17,26 @@
...
@@ -17,6 +17,26 @@
const
real
erfcAlphaR
=
(
0.254829592
f
+
(
-
0.284496736
f
+
(
1.421413741
f
+
(
-
1.453152027
f
+
1.061405429
f
*
t
)
*
t
)
*
t
)
*
t
)
*
t
*
expAlphaRSqr
;
const
real
erfcAlphaR
=
(
0.254829592
f
+
(
-
0.284496736
f
+
(
1.421413741
f
+
(
-
1.453152027
f
+
1.061405429
f
*
t
)
*
t
)
*
t
)
*
t
)
*
t
*
expAlphaRSqr
;
#endif
#endif
real
tempForce
=
0.0
f
;
real
tempForce
=
0.0
f
;
#if HAS_LENNARD_JONES
// The multiplicative term to correct for the multiplicative terms that are always
// present in reciprocal space. The real terms have an additive contribution
// added in, but for excluded terms the multiplicative term is just subtracted.
// These factors are needed in both clauses of the needCorrection statement, so
// I declare them up here.
#if DO_LJPME
const
real
dispersionAlphaR
=
EWALD_DISPERSION_ALPHA
*
r
;
const
real
dar2
=
dispersionAlphaR
*
dispersionAlphaR
;
const
real
dar4
=
dar2
*
dar2
;
const
real
dar6
=
dar4
*
dar2
;
const
real
invR2
=
invR
*
invR
;
const
real
expDar2
=
EXP
(
-
dar2
);
const
real2
sigExpProd
=
sigmaEpsilon1
*
sigmaEpsilon2
;
const
real
c6
=
64
*
sigExpProd
.
x
*
sigExpProd
.
x
*
sigExpProd
.
x
*
sigExpProd
.
y
;
const
real
coef
=
invR2
*
invR2
*
invR2
*
c6
;
const
real
eprefac
=
1.0
f
+
dar2
+
0.5
f
*
dar4
;
const
real
dprefac
=
eprefac
+
dar6
/
6.0
f
;
#endif
#endif
if
(
needCorrection
)
{
if
(
needCorrection
)
{
// Subtract off the part of this interaction that was included in the reciprocal space contribution.
// Subtract off the part of this interaction that was included in the reciprocal space contribution.
...
@@ -29,6 +49,13 @@
...
@@ -29,6 +49,13 @@
includeInteraction
=
false
;
includeInteraction
=
false
;
tempEnergy
-=
TWO_OVER_SQRT_PI
*
EWALD_ALPHA
*
138.935456
f
*
posq1
.
w
*
posq2
.
w
;
tempEnergy
-=
TWO_OVER_SQRT_PI
*
EWALD_ALPHA
*
138.935456
f
*
posq1
.
w
*
posq2
.
w
;
}
}
#if HAS_LENNARD_JONES
#if DO_LJPME
// The multiplicative grid term
tempEnergy
+=
coef
*
(
1.0
f
-
expDar2
*
eprefac
);
tempForce
+=
6.0
f
*
coef
*
(
1.0
f
-
expDar2
*
dprefac
);
#endif
#endif
}
}
else
{
else
{
#if HAS_LENNARD_JONES
#if HAS_LENNARD_JONES
...
@@ -36,7 +63,8 @@
...
@@ -36,7 +63,8 @@
real
sig2
=
invR
*
sig
;
real
sig2
=
invR
*
sig
;
sig2
*=
sig2
;
sig2
*=
sig2
;
real
sig6
=
sig2
*
sig2
*
sig2
;
real
sig6
=
sig2
*
sig2
*
sig2
;
real
epssig6
=
sig6
*
(
sigmaEpsilon1
.
y
*
sigmaEpsilon2
.
y
);
real
eps
=
sigmaEpsilon1
.
y
*
sigmaEpsilon2
.
y
;
real
epssig6
=
sig6
*
eps
;
tempForce
=
epssig6
*
(
12.0
f
*
sig6
-
6.0
f
);
tempForce
=
epssig6
*
(
12.0
f
*
sig6
-
6.0
f
);
real
ljEnergy
=
epssig6
*
(
sig6
-
1.0
f
);
real
ljEnergy
=
epssig6
*
(
sig6
-
1.0
f
);
#if USE_LJ_SWITCH
#if USE_LJ_SWITCH
...
@@ -48,6 +76,22 @@
...
@@ -48,6 +76,22 @@
ljEnergy
*=
switchValue
;
ljEnergy
*=
switchValue
;
}
}
#endif
#endif
#if DO_LJPME
// The multiplicative grid term
ljEnergy
+=
coef
*
(
1.0
f
-
expDar2
*
eprefac
);
tempForce
+=
6.0
f
*
coef
*
(
1.0
f
-
expDar2
*
dprefac
);
// The potential shift accounts for the step at the cutoff introduced by the
// transition from additive to multiplicative combination rules and is only
// needed for the real (not excluded) terms. By adding these terms to ljEnergy
// instead of tempEnergy here, the includeInteraction mask is correctly applied.
sig2
=
sig
*
sig
;
sig6
=
sig2
*
sig2
*
sig2
*
INVCUT6
;
epssig6
=
eps
*
sig6
;
// The additive part of the potential shift
ljEnergy
+=
epssig6
*
(
1.0
f
-
sig6
);
// The multiplicative part of the potential shift
ljEnergy
+=
MULTSHIFT6
*
c6
;
#endif
tempForce
+=
prefactor
*
(
erfcAlphaR
+
alphaR
*
expAlphaRSqr
*
TWO_OVER_SQRT_PI
);
tempForce
+=
prefactor
*
(
erfcAlphaR
+
alphaR
*
expAlphaRSqr
*
TWO_OVER_SQRT_PI
);
tempEnergy
+=
includeInteraction
?
ljEnergy
+
prefactor
*
erfcAlphaR
:
0
;
tempEnergy
+=
includeInteraction
?
ljEnergy
+
prefactor
*
erfcAlphaR
:
0
;
#else
#else
...
...
platforms/cuda/src/kernels/customGBEnergyN2.cu
View file @
3b6925ae
...
@@ -14,7 +14,7 @@ typedef struct {
...
@@ -14,7 +14,7 @@ typedef struct {
* Compute a force based on pair interactions.
* Compute a force based on pair interactions.
*/
*/
extern
"C"
__global__
void
computeN2Energy
(
unsigned
long
long
*
__restrict__
forceBuffers
,
mixed
*
__restrict__
energyBuffer
,
extern
"C"
__global__
void
computeN2Energy
(
unsigned
long
long
*
__restrict__
forceBuffers
,
mixed
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
unsigned
int
*
__restrict__
exclusions
,
const
ushort2
*
__restrict__
exclusionTiles
,
const
real4
*
__restrict__
posq
,
const
unsigned
int
*
__restrict__
exclusions
,
const
ushort2
*
__restrict__
exclusionTiles
,
bool
needEnergy
,
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
const
int
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
const
int
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
...
@@ -78,7 +78,8 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
...
@@ -78,7 +78,8 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
COMPUTE_INTERACTION
COMPUTE_INTERACTION
dEdR
/=
-
r
;
dEdR
/=
-
r
;
}
}
energy
+=
0.5
f
*
tempEnergy
;
if
(
needEnergy
)
energy
+=
0.5
f
*
tempEnergy
;
delta
*=
dEdR
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
y
-=
delta
.
y
;
...
@@ -130,7 +131,8 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
...
@@ -130,7 +131,8 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
COMPUTE_INTERACTION
COMPUTE_INTERACTION
dEdR
/=
-
r
;
dEdR
/=
-
r
;
}
}
energy
+=
tempEnergy
;
if
(
needEnergy
)
energy
+=
tempEnergy
;
delta
*=
dEdR
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
y
-=
delta
.
y
;
...
@@ -234,7 +236,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
...
@@ -234,7 +236,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
LOAD_ATOM1_PARAMETERS
LOAD_ATOM1_PARAMETERS
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
]
:
y
*
TILE_SIZE
+
tgx
)
;
unsigned
int
j
=
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
];
#else
#else
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
#endif
...
@@ -274,7 +276,8 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
...
@@ -274,7 +276,8 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
COMPUTE_INTERACTION
COMPUTE_INTERACTION
dEdR
/=
-
r
;
dEdR
/=
-
r
;
}
}
energy
+=
tempEnergy
;
if
(
needEnergy
)
energy
+=
tempEnergy
;
delta
*=
dEdR
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
y
-=
delta
.
y
;
...
@@ -318,7 +321,8 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
...
@@ -318,7 +321,8 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
COMPUTE_INTERACTION
COMPUTE_INTERACTION
dEdR
/=
-
r
;
dEdR
/=
-
r
;
}
}
energy
+=
tempEnergy
;
if
(
needEnergy
)
energy
+=
tempEnergy
;
delta
*=
dEdR
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
y
-=
delta
.
y
;
...
...
platforms/cuda/src/kernels/customGBValueN2.cu
View file @
3b6925ae
...
@@ -212,7 +212,7 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
...
@@ -212,7 +212,7 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
LOAD_ATOM1_PARAMETERS
LOAD_ATOM1_PARAMETERS
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
]
:
y
*
TILE_SIZE
+
tgx
)
;
unsigned
int
j
=
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
];
#else
#else
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
#endif
...
...
platforms/cuda/src/kernels/customIntegratorPerDof.cu
View file @
3b6925ae
...
@@ -34,7 +34,7 @@ inline __device__ mixed4 convertFromDouble4(double4 a) {
...
@@ -34,7 +34,7 @@ inline __device__ mixed4 convertFromDouble4(double4 a) {
extern
"C"
__global__
void
computePerDof
(
real4
*
__restrict__
posq
,
real4
*
__restrict__
posqCorrection
,
mixed4
*
__restrict__
posDelta
,
extern
"C"
__global__
void
computePerDof
(
real4
*
__restrict__
posq
,
real4
*
__restrict__
posqCorrection
,
mixed4
*
__restrict__
posDelta
,
mixed4
*
__restrict__
velm
,
const
long
long
*
__restrict__
force
,
const
mixed2
*
__restrict__
dt
,
const
mixed
*
__restrict__
globals
,
mixed4
*
__restrict__
velm
,
const
long
long
*
__restrict__
force
,
const
mixed2
*
__restrict__
dt
,
const
mixed
*
__restrict__
globals
,
mixed
*
__restrict__
sum
,
const
float4
*
__restrict__
gaussianValues
,
unsigned
int
gaussianBaseIndex
,
const
float4
*
__restrict__
uniformValues
,
mixed
*
__restrict__
sum
,
const
float4
*
__restrict__
gaussianValues
,
unsigned
int
gaussianBaseIndex
,
const
float4
*
__restrict__
uniformValues
,
const
real
energy
,
mixed
*
__restrict__
energyParamDerivs
const
mixed
energy
,
mixed
*
__restrict__
energyParamDerivs
PARAMETER_ARGUMENTS
)
{
PARAMETER_ARGUMENTS
)
{
mixed
stepSize
=
dt
[
0
].
y
;
mixed
stepSize
=
dt
[
0
].
y
;
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
...
...
platforms/cuda/src/kernels/customNonbonded.cu
View file @
3b6925ae
...
@@ -14,8 +14,10 @@ if (!isExcluded) {
...
@@ -14,8 +14,10 @@ if (!isExcluded) {
#endif
#endif
COMPUTE_FORCE
COMPUTE_FORCE
#if USE_SWITCH
#if USE_SWITCH
tempForce
=
tempForce
*
switchValue
-
tempEnergy
*
switchDeriv
;
tempForce
=
tempForce
*
switchValue
-
customEnergy
*
switchDeriv
;
tempEnergy
*=
switchValue
;
tempEnergy
+=
customEnergy
*
switchValue
;
#else
tempEnergy
+=
customEnergy
;
#endif
#endif
dEdR
+=
tempForce
*
invR
;
dEdR
+=
tempForce
*
invR
;
}
}
platforms/cuda/src/kernels/findInteractingBlocks.cu
View file @
3b6925ae
...
@@ -27,8 +27,19 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize,
...
@@ -27,8 +27,19 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize,
maxPos
=
make_real4
(
max
(
maxPos
.
x
,
pos
.
x
),
max
(
maxPos
.
y
,
pos
.
y
),
max
(
maxPos
.
z
,
pos
.
z
),
0
);
maxPos
=
make_real4
(
max
(
maxPos
.
x
,
pos
.
x
),
max
(
maxPos
.
y
,
pos
.
y
),
max
(
maxPos
.
z
,
pos
.
z
),
0
);
}
}
real4
blockSize
=
0.5
f
*
(
maxPos
-
minPos
);
real4
blockSize
=
0.5
f
*
(
maxPos
-
minPos
);
real4
center
=
0.5
f
*
(
maxPos
+
minPos
);
center
.
w
=
0
;
for
(
int
i
=
base
;
i
<
last
;
i
++
)
{
pos
=
posq
[
i
];
real4
delta
=
posq
[
i
]
-
center
;
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
center
.
w
=
max
(
center
.
w
,
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
);
}
center
.
w
=
sqrt
(
center
.
w
);
blockBoundingBox
[
index
]
=
blockSize
;
blockBoundingBox
[
index
]
=
blockSize
;
blockCenter
[
index
]
=
0.5
f
*
(
maxPos
+
minPos
)
;
blockCenter
[
index
]
=
center
;
sortedBlocks
[
index
]
=
make_real2
(
blockSize
.
x
+
blockSize
.
y
+
blockSize
.
z
,
index
);
sortedBlocks
[
index
]
=
make_real2
(
blockSize
.
x
+
blockSize
.
y
+
blockSize
.
z
,
index
);
index
+=
blockDim
.
x
*
gridDim
.
x
;
index
+=
blockDim
.
x
*
gridDim
.
x
;
base
=
index
*
TILE_SIZE
;
base
=
index
*
TILE_SIZE
;
...
@@ -61,9 +72,60 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co
...
@@ -61,9 +72,60 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co
if
(
rebuild
)
{
if
(
rebuild
)
{
rebuildNeighborList
[
0
]
=
1
;
rebuildNeighborList
[
0
]
=
1
;
interactionCount
[
0
]
=
0
;
interactionCount
[
0
]
=
0
;
interactionCount
[
1
]
=
0
;
}
}
}
}
/**
 * Split a buffer of (atom, flags) entries into "single pairs" and a compacted remainder.
 *
 * Each thread takes the entries i = indexInWarp, indexInWarp+32, ... of the buffer. An entry
 * whose flags word has at most MAX_BITS_FOR_PAIRS bits set is expanded into one int2 per set
 * bit and appended to singlePairs; every other entry is moved toward the front of atoms/flags,
 * preserving order.
 *
 * @param x               multiplied by TILE_SIZE and added to a bit position to form the second
 *                        component of each stored pair
 * @param atoms           per-entry values; atoms[i] becomes the first component of each pair
 *                        produced by entry i. Overwritten in place by the compaction step.
 * @param flags           per-entry bit masks; one pair is emitted per set bit. Overwritten in
 *                        place by the compaction step.
 * @param length          number of valid entries in atoms/flags
 * @param maxSinglePairs  an entry is only written while pairIndex+count stays below this value
 * @param singlePairCount counter that is atomically advanced by the number of pairs found
 * @param singlePairs     output array receiving the pairs
 * @param sumBuffer       scratch array indexed by indexInWarp (slots 0..31 are used)
 * @param pairStartIndex  slot written by thread 0 and then read by every thread
 * @return the number of entries left at the front of atoms/flags after compaction
 *
 * NOTE(review): sumBuffer and pairStartIndex are written by one thread and read by others
 * with no explicit synchronization visible here; this presumably relies on the 32 threads of
 * a warp executing in lockstep -- confirm for the targeted architectures.
 */
__device__
int
saveSinglePairs
(
int
x
,
int
*
atoms
,
int
*
flags
,
int
length
,
unsigned
int
maxSinglePairs
,
unsigned
int
*
singlePairCount
,
int2
*
singlePairs
,
int
*
sumBuffer
,
volatile
int
&
pairStartIndex
)
{
// Record interactions that should be computed as single pairs rather than in blocks.
// Step 1: each thread totals the set bits of the entries it owns, counting only
// entries that are small enough (<= MAX_BITS_FOR_PAIRS bits) to be stored as pairs.
const
int
indexInWarp
=
threadIdx
.
x
%
32
;
int
sum
=
0
;
for
(
int
i
=
indexInWarp
;
i
<
length
;
i
+=
32
)
{
int
count
=
__popc
(
flags
[
i
]);
sum
+=
(
count
<=
MAX_BITS_FOR_PAIRS
?
count
:
0
);
}
// Step 2: turn the per-thread totals into an inclusive running sum over the 32 slots
// by repeatedly adding the value found `step` slots earlier (step = 1, 2, 4, 8, 16).
sumBuffer
[
indexInWarp
]
=
sum
;
for
(
int
step
=
1
;
step
<
32
;
step
*=
2
)
{
int
add
=
(
indexInWarp
>=
step
?
sumBuffer
[
indexInWarp
-
step
]
:
0
);
sumBuffer
[
indexInWarp
]
+=
add
;
}
// Step 3: slot 31 now holds the grand total. Thread 0 reserves that many output slots
// with a single atomicAdd; the counter is advanced even if the pairs later turn out
// not to fit below maxSinglePairs.
int
pairsToStore
=
sumBuffer
[
31
];
if
(
indexInWarp
==
0
)
pairStartIndex
=
atomicAdd
(
singlePairCount
,
pairsToStore
);
// This thread's first output slot: reserved base plus the total of all lower-indexed threads.
int
pairIndex
=
pairStartIndex
+
(
indexInWarp
>
0
?
sumBuffer
[
indexInWarp
-
1
]
:
0
);
// Step 4: expand each qualifying entry into one pair per set bit.
// Entries that qualify by bit count but fail the maxSinglePairs check are not written
// here, and the compaction below does not keep them either.
// NOTE(review): presumably the caller detects that overflow from the counter value -- confirm.
for
(
int
i
=
indexInWarp
;
i
<
length
;
i
+=
32
)
{
int
count
=
__popc
(
flags
[
i
]);
if
(
count
<=
MAX_BITS_FOR_PAIRS
&&
pairIndex
+
count
<
maxSinglePairs
)
{
int
f
=
flags
[
i
];
while
(
f
!=
0
)
{
// __ffs(f)-1 is the zero-based position of the lowest set bit of f.
singlePairs
[
pairIndex
]
=
make_int2
(
atoms
[
i
],
x
*
TILE_SIZE
+
__ffs
(
f
)
-
1
);
// Clear the lowest set bit so the next iteration sees the following one.
f
&=
f
-
1
;
pairIndex
++
;
}
}
}
// Compact the remaining interactions.
// Entries with more than MAX_BITS_FOR_PAIRS bits set are packed to the front of
// atoms/flags, 32 entries per pass, keeping their relative order.
// warpMask selects the ballot bits of the threads with a lower index than this one.
const
int
warpMask
=
(
1
<<
indexInWarp
)
-
1
;
int
numCompacted
=
0
;
for
(
int
start
=
0
;
start
<
length
;
start
+=
32
)
{
int
i
=
start
+
indexInWarp
;
// The entry is loaded before the i < length test below, so in the final pass indices
// at or beyond length are read (and then ignored).
// NOTE(review): assumes both arrays are readable up to the next multiple of 32 -- confirm.
int
atom
=
atoms
[
i
];
int
flag
=
flags
[
i
];
bool
include
=
(
i
<
length
&&
__popc
(
flags
[
i
])
>
MAX_BITS_FOR_PAIRS
);
// One bit per thread: set if that thread is keeping its entry in this pass.
int
includeFlags
=
__ballot
(
include
);
if
(
include
)
{
// Destination = entries kept in earlier passes + kept entries of lower-indexed threads.
int
index
=
numCompacted
+
__popc
(
includeFlags
&
warpMask
);
atoms
[
index
]
=
atom
;
flags
[
index
]
=
flag
;
}
numCompacted
+=
__popc
(
includeFlags
);
}
return
numCompacted
;
}
/**
/**
* Compare the bounding boxes for each pair of atom blocks (comprised of 32 atoms each), forming a tile. If the two
* Compare the bounding boxes for each pair of atom blocks (comprised of 32 atoms each), forming a tile. If the two
* atom blocks are sufficiently far apart, mark them as non-interacting. There are two stages in the algorithm.
* atom blocks are sufficiently far apart, mark them as non-interacting. There are two stages in the algorithm.
...
@@ -114,8 +176,9 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co
...
@@ -114,8 +176,9 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co
*
*
*/
*/
extern
"C"
__global__
void
findBlocksWithInteractions
(
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
extern
"C"
__global__
void
findBlocksWithInteractions
(
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
unsigned
int
*
__restrict__
interactionCount
,
int
*
__restrict__
interactingTiles
,
unsigned
int
*
__restrict__
interactingAtoms
,
const
real4
*
__restrict__
posq
,
unsigned
int
*
__restrict__
interactionCount
,
int
*
__restrict__
interactingTiles
,
unsigned
int
*
__restrict__
interactingAtoms
,
unsigned
int
maxTiles
,
unsigned
int
startBlockIndex
,
unsigned
int
numBlocks
,
real2
*
__restrict__
sortedBlocks
,
const
real4
*
__restrict__
sortedBlockCenter
,
int2
*
__restrict__
singlePairs
,
const
real4
*
__restrict__
posq
,
unsigned
int
maxTiles
,
unsigned
int
maxSinglePairs
,
unsigned
int
startBlockIndex
,
unsigned
int
numBlocks
,
real2
*
__restrict__
sortedBlocks
,
const
real4
*
__restrict__
sortedBlockCenter
,
const
real4
*
__restrict__
sortedBlockBoundingBox
,
const
unsigned
int
*
__restrict__
exclusionIndices
,
const
unsigned
int
*
__restrict__
exclusionRowIndices
,
const
real4
*
__restrict__
sortedBlockBoundingBox
,
const
unsigned
int
*
__restrict__
exclusionIndices
,
const
unsigned
int
*
__restrict__
exclusionRowIndices
,
real4
*
__restrict__
oldPositions
,
const
int
*
__restrict__
rebuildNeighborList
)
{
real4
*
__restrict__
oldPositions
,
const
int
*
__restrict__
rebuildNeighborList
)
{
...
@@ -128,12 +191,17 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
...
@@ -128,12 +191,17 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
const
int
warpIndex
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
32
;
const
int
warpIndex
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
32
;
const
int
warpMask
=
(
1
<<
indexInWarp
)
-
1
;
const
int
warpMask
=
(
1
<<
indexInWarp
)
-
1
;
__shared__
int
workgroupBuffer
[
BUFFER_SIZE
*
(
GROUP_SIZE
/
32
)];
__shared__
int
workgroupBuffer
[
BUFFER_SIZE
*
(
GROUP_SIZE
/
32
)];
__shared__
int
workgroupFlagsBuffer
[
BUFFER_SIZE
*
(
GROUP_SIZE
/
32
)];
__shared__
int
warpExclusions
[
MAX_EXCLUSIONS
*
(
GROUP_SIZE
/
32
)];
__shared__
int
warpExclusions
[
MAX_EXCLUSIONS
*
(
GROUP_SIZE
/
32
)];
__shared__
real3
posBuffer
[
GROUP_SIZE
];
__shared__
real3
posBuffer
[
GROUP_SIZE
];
__shared__
volatile
int
workgroupTileIndex
[
GROUP_SIZE
/
32
];
__shared__
volatile
int
workgroupTileIndex
[
GROUP_SIZE
/
32
];
__shared__
int
sumBuffer
[
GROUP_SIZE
];
__shared__
int
worksgroupPairStartIndex
[
GROUP_SIZE
/
32
];
int
*
buffer
=
workgroupBuffer
+
BUFFER_SIZE
*
(
warpStart
/
32
);
int
*
buffer
=
workgroupBuffer
+
BUFFER_SIZE
*
(
warpStart
/
32
);
int
*
flagsBuffer
=
workgroupFlagsBuffer
+
BUFFER_SIZE
*
(
warpStart
/
32
);
int
*
exclusionsForX
=
warpExclusions
+
MAX_EXCLUSIONS
*
(
warpStart
/
32
);
int
*
exclusionsForX
=
warpExclusions
+
MAX_EXCLUSIONS
*
(
warpStart
/
32
);
volatile
int
&
tileStartIndex
=
workgroupTileIndex
[
warpStart
/
32
];
volatile
int
&
tileStartIndex
=
workgroupTileIndex
[
warpStart
/
32
];
volatile
int
&
pairStartIndex
=
worksgroupPairStartIndex
[
warpStart
/
32
];
// Loop over blocks.
// Loop over blocks.
...
@@ -176,16 +244,24 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
...
@@ -176,16 +244,24 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
int
block2
=
block2Base
+
indexInWarp
;
int
block2
=
block2Base
+
indexInWarp
;
bool
includeBlock2
=
(
block2
<
NUM_BLOCKS
);
bool
includeBlock2
=
(
block2
<
NUM_BLOCKS
);
if
(
includeBlock2
)
{
if
(
includeBlock2
)
{
real4
blockCenterY
=
(
block2
<
NUM_BLOCKS
?
sortedBlockCenter
[
block2
]
:
make_real4
(
0
))
;
real4
blockCenterY
=
sortedBlockCenter
[
block2
];
real4
blockSizeY
=
(
block2
<
NUM_BLOCKS
?
sortedBlockBoundingBox
[
block2
]
:
make_real4
(
0
))
;
real4
blockSizeY
=
sortedBlockBoundingBox
[
block2
];
real4
blockDelta
=
blockCenterX
-
blockCenterY
;
real4
blockDelta
=
blockCenterX
-
blockCenterY
;
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
blockDelta
)
APPLY_PERIODIC_TO_DELTA
(
blockDelta
)
#endif
#endif
includeBlock2
&=
(
blockDelta
.
x
*
blockDelta
.
x
+
blockDelta
.
y
*
blockDelta
.
y
+
blockDelta
.
z
*
blockDelta
.
z
<
(
PADDED_CUTOFF
+
blockCenterX
.
w
+
blockCenterY
.
w
)
*
(
PADDED_CUTOFF
+
blockCenterX
.
w
+
blockCenterY
.
w
));
blockDelta
.
x
=
max
(
0.0
f
,
fabs
(
blockDelta
.
x
)
-
blockSizeX
.
x
-
blockSizeY
.
x
);
blockDelta
.
x
=
max
(
0.0
f
,
fabs
(
blockDelta
.
x
)
-
blockSizeX
.
x
-
blockSizeY
.
x
);
blockDelta
.
y
=
max
(
0.0
f
,
fabs
(
blockDelta
.
y
)
-
blockSizeX
.
y
-
blockSizeY
.
y
);
blockDelta
.
y
=
max
(
0.0
f
,
fabs
(
blockDelta
.
y
)
-
blockSizeX
.
y
-
blockSizeY
.
y
);
blockDelta
.
z
=
max
(
0.0
f
,
fabs
(
blockDelta
.
z
)
-
blockSizeX
.
z
-
blockSizeY
.
z
);
blockDelta
.
z
=
max
(
0.0
f
,
fabs
(
blockDelta
.
z
)
-
blockSizeX
.
z
-
blockSizeY
.
z
);
includeBlock2
&=
(
blockDelta
.
x
*
blockDelta
.
x
+
blockDelta
.
y
*
blockDelta
.
y
+
blockDelta
.
z
*
blockDelta
.
z
<
PADDED_CUTOFF_SQUARED
);
includeBlock2
&=
(
blockDelta
.
x
*
blockDelta
.
x
+
blockDelta
.
y
*
blockDelta
.
y
+
blockDelta
.
z
*
blockDelta
.
z
<
PADDED_CUTOFF_SQUARED
);
#ifdef TRICLINIC
// The calculation to find the nearest periodic copy is only guaranteed to work if the nearest copy is less than half a box width away.
// If there's any possibility we might have missed it, do a detailed check.
if
(
periodicBoxSize
.
z
/
2
-
blockSizeX
.
z
-
blockSizeY
.
z
<
PADDED_CUTOFF
||
periodicBoxSize
.
y
/
2
-
blockSizeX
.
y
-
blockSizeY
.
y
<
PADDED_CUTOFF
)
includeBlock2
=
true
;
#endif
if
(
includeBlock2
)
{
if
(
includeBlock2
)
{
unsigned
short
y
=
(
unsigned
short
)
sortedBlocks
[
block2
].
y
;
unsigned
short
y
=
(
unsigned
short
)
sortedBlocks
[
block2
].
y
;
for
(
int
k
=
0
;
k
<
numExclusions
;
k
++
)
for
(
int
k
=
0
;
k
<
numExclusions
;
k
++
)
...
@@ -203,29 +279,36 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
...
@@ -203,29 +279,36 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
// Check each atom in block Y for interactions.
// Check each atom in block Y for interactions.
int
start
=
y
*
TILE_SIZE
;
int
atom2
=
y
*
TILE_SIZE
+
indexInWarp
;
int
atom2
=
start
+
indexInWarp
;
real3
pos2
=
trimTo3
(
posq
[
atom2
]);
real3
pos2
=
trimTo3
(
posq
[
atom2
]);
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
if
(
singlePeriodicCopy
)
{
if
(
singlePeriodicCopy
)
{
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
pos2
,
blockCenterX
)
APPLY_PERIODIC_TO_POS_WITH_CENTER
(
pos2
,
blockCenterX
)
}
}
#endif
#endif
bool
interacts
=
false
;
real4
blockCenterY
=
sortedBlockCenter
[
block2Base
+
i
];
if
(
atom2
<
NUM_ATOMS
)
{
real3
atomDelta
=
posBuffer
[
warpStart
+
indexInWarp
]
-
trimTo3
(
blockCenterY
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
atomDelta
)
#endif
int
atomFlags
=
ballot
(
atomDelta
.
x
*
atomDelta
.
x
+
atomDelta
.
y
*
atomDelta
.
y
+
atomDelta
.
z
*
atomDelta
.
z
<
(
PADDED_CUTOFF
+
blockCenterY
.
w
)
*
(
PADDED_CUTOFF
+
blockCenterY
.
w
));
int
interacts
=
0
;
if
(
atom2
<
NUM_ATOMS
&&
atomFlags
!=
0
)
{
int
first
=
__ffs
(
atomFlags
)
-
1
;
int
last
=
32
-
__clz
(
atomFlags
);
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
if
(
!
singlePeriodicCopy
)
{
if
(
!
singlePeriodicCopy
)
{
for
(
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
for
(
int
j
=
first
;
j
<
last
;
j
++
)
{
real3
delta
=
pos2
-
posBuffer
[
warpStart
+
j
];
real3
delta
=
pos2
-
posBuffer
[
warpStart
+
j
];
APPLY_PERIODIC_TO_DELTA
(
delta
)
APPLY_PERIODIC_TO_DELTA
(
delta
)
interacts
|=
(
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
<
PADDED_CUTOFF_SQUARED
);
interacts
|=
(
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
<
PADDED_CUTOFF_SQUARED
?
1
<<
j
:
0
);
}
}
}
}
else
{
else
{
#endif
#endif
for
(
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
for
(
int
j
=
first
;
j
<
last
;
j
++
)
{
real3
delta
=
pos2
-
posBuffer
[
warpStart
+
j
];
real3
delta
=
pos2
-
posBuffer
[
warpStart
+
j
];
interacts
|=
(
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
<
PADDED_CUTOFF_SQUARED
);
interacts
|=
(
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
<
PADDED_CUTOFF_SQUARED
?
1
<<
j
:
0
);
}
}
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
}
}
...
@@ -235,34 +318,46 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
...
@@ -235,34 +318,46 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
// Add any interacting atoms to the buffer.
// Add any interacting atoms to the buffer.
int
includeAtomFlags
=
__ballot
(
interacts
);
int
includeAtomFlags
=
__ballot
(
interacts
);
if
(
interacts
)
if
(
interacts
)
{
buffer
[
neighborsInBuffer
+
__popc
(
includeAtomFlags
&
warpMask
)]
=
atom2
;
int
index
=
neighborsInBuffer
+
__popc
(
includeAtomFlags
&
warpMask
);
buffer
[
index
]
=
atom2
;
flagsBuffer
[
index
]
=
interacts
;
}
neighborsInBuffer
+=
__popc
(
includeAtomFlags
);
neighborsInBuffer
+=
__popc
(
includeAtomFlags
);
if
(
neighborsInBuffer
>
BUFFER_SIZE
-
TILE_SIZE
)
{
if
(
neighborsInBuffer
>
BUFFER_SIZE
-
TILE_SIZE
)
{
// Store the new tiles to memory.
// Store the new tiles to memory.
#if MAX_BITS_FOR_PAIRS > 0
neighborsInBuffer
=
saveSinglePairs
(
x
,
buffer
,
flagsBuffer
,
neighborsInBuffer
,
maxSinglePairs
,
&
interactionCount
[
1
],
singlePairs
,
sumBuffer
+
warpStart
,
pairStartIndex
);
#endif
int
tilesToStore
=
neighborsInBuffer
/
TILE_SIZE
;
int
tilesToStore
=
neighborsInBuffer
/
TILE_SIZE
;
if
(
indexInWarp
==
0
)
if
(
tilesToStore
>
0
)
{
tileStartIndex
=
atomicAdd
(
interactionCount
,
tilesToStore
);
if
(
indexInWarp
==
0
)
int
newTileStartIndex
=
tileStartIndex
;
tileStartIndex
=
atomicAdd
(
&
interactionCount
[
0
],
tilesToStore
);
if
(
newTileStartIndex
+
tilesToStore
<=
maxTiles
)
{
int
newTileStartIndex
=
tileStartIndex
;
if
(
indexInWarp
<
tilesToStore
)
if
(
newTileStartIndex
+
tilesToStore
<=
maxTiles
)
{
interactingTiles
[
newTileStartIndex
+
indexInWarp
]
=
x
;
if
(
indexInWarp
<
tilesToStore
)
for
(
int
j
=
0
;
j
<
tilesToStore
;
j
++
)
interactingTiles
[
newTileStartIndex
+
indexInWarp
]
=
x
;
interactingAtoms
[(
newTileStartIndex
+
j
)
*
TILE_SIZE
+
indexInWarp
]
=
buffer
[
indexInWarp
+
j
*
TILE_SIZE
];
for
(
int
j
=
0
;
j
<
tilesToStore
;
j
++
)
interactingAtoms
[(
newTileStartIndex
+
j
)
*
TILE_SIZE
+
indexInWarp
]
=
buffer
[
indexInWarp
+
j
*
TILE_SIZE
];
}
buffer
[
indexInWarp
]
=
buffer
[
indexInWarp
+
TILE_SIZE
*
tilesToStore
];
neighborsInBuffer
-=
TILE_SIZE
*
tilesToStore
;
}
}
buffer
[
indexInWarp
]
=
buffer
[
indexInWarp
+
TILE_SIZE
*
tilesToStore
];
neighborsInBuffer
-=
TILE_SIZE
*
tilesToStore
;
}
}
}
}
}
}
// If we have a partially filled buffer, store it to memory.
// If we have a partially filled buffer, store it to memory.
#if MAX_BITS_FOR_PAIRS > 0
if
(
neighborsInBuffer
>
32
)
neighborsInBuffer
=
saveSinglePairs
(
x
,
buffer
,
flagsBuffer
,
neighborsInBuffer
,
maxSinglePairs
,
&
interactionCount
[
1
],
singlePairs
,
sumBuffer
+
warpStart
,
pairStartIndex
);
#endif
if
(
neighborsInBuffer
>
0
)
{
if
(
neighborsInBuffer
>
0
)
{
int
tilesToStore
=
(
neighborsInBuffer
+
TILE_SIZE
-
1
)
/
TILE_SIZE
;
int
tilesToStore
=
(
neighborsInBuffer
+
TILE_SIZE
-
1
)
/
TILE_SIZE
;
if
(
indexInWarp
==
0
)
if
(
indexInWarp
==
0
)
tileStartIndex
=
atomicAdd
(
interactionCount
,
tilesToStore
);
tileStartIndex
=
atomicAdd
(
&
interactionCount
[
0
]
,
tilesToStore
);
int
newTileStartIndex
=
tileStartIndex
;
int
newTileStartIndex
=
tileStartIndex
;
if
(
newTileStartIndex
+
tilesToStore
<=
maxTiles
)
{
if
(
newTileStartIndex
+
tilesToStore
<=
maxTiles
)
{
if
(
indexInWarp
<
tilesToStore
)
if
(
indexInWarp
<
tilesToStore
)
...
@@ -277,4 +372,4 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
...
@@ -277,4 +372,4 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
for
(
int
i
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
for
(
int
i
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
oldPositions
[
i
]
=
posq
[
i
];
oldPositions
[
i
]
=
posq
[
i
];
}
}
\ No newline at end of file
platforms/cuda/src/kernels/gbsaObc1.cu
View file @
3b6925ae
...
@@ -264,7 +264,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
...
@@ -264,7 +264,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
real4
posq1
=
posq
[
atom1
];
real4
posq1
=
posq
[
atom1
];
float2
params1
=
global_params
[
atom1
];
float2
params1
=
global_params
[
atom1
];
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
]
:
y
*
TILE_SIZE
+
tgx
)
;
unsigned
int
j
=
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
];
#else
#else
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
#endif
...
@@ -400,7 +400,7 @@ typedef struct {
...
@@ -400,7 +400,7 @@ typedef struct {
*/
*/
extern
"C"
__global__
void
computeGBSAForce1
(
unsigned
long
long
*
__restrict__
forceBuffers
,
unsigned
long
long
*
__restrict__
global_bornForce
,
extern
"C"
__global__
void
computeGBSAForce1
(
unsigned
long
long
*
__restrict__
forceBuffers
,
unsigned
long
long
*
__restrict__
global_bornForce
,
mixed
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
global_bornRadii
,
mixed
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
global_bornRadii
,
bool
needEnergy
,
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
const
int
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
const
int
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
...
@@ -465,7 +465,8 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
...
@@ -465,7 +465,8 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
if
(
atom1
!=
y
*
TILE_SIZE
+
j
)
if
(
atom1
!=
y
*
TILE_SIZE
+
j
)
tempEnergy
-=
scaledChargeProduct
/
CUTOFF
;
tempEnergy
-=
scaledChargeProduct
/
CUTOFF
;
#endif
#endif
energy
+=
0.5
f
*
tempEnergy
;
if
(
needEnergy
)
energy
+=
0.5
f
*
tempEnergy
;
delta
*=
dEdR
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
y
-=
delta
.
y
;
...
@@ -519,7 +520,8 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
...
@@ -519,7 +520,8 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
tempEnergy
-=
scaledChargeProduct
/
CUTOFF
;
tempEnergy
-=
scaledChargeProduct
/
CUTOFF
;
#endif
#endif
energy
+=
tempEnergy
;
if
(
needEnergy
)
energy
+=
tempEnergy
;
delta
*=
dEdR
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
y
-=
delta
.
y
;
...
@@ -617,7 +619,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
...
@@ -617,7 +619,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
real4
posq1
=
posq
[
atom1
];
real4
posq1
=
posq
[
atom1
];
real
bornRadius1
=
global_bornRadii
[
atom1
];
real
bornRadius1
=
global_bornRadii
[
atom1
];
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
]
:
y
*
TILE_SIZE
+
tgx
)
;
unsigned
int
j
=
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
];
#else
#else
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
#endif
...
@@ -667,7 +669,8 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
...
@@ -667,7 +669,8 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
tempEnergy
-=
scaledChargeProduct
/
CUTOFF
;
tempEnergy
-=
scaledChargeProduct
/
CUTOFF
;
#endif
#endif
energy
+=
tempEnergy
;
if
(
needEnergy
)
energy
+=
tempEnergy
;
delta
*=
dEdR
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
y
-=
delta
.
y
;
...
@@ -716,7 +719,8 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
...
@@ -716,7 +719,8 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
tempEnergy
-=
scaledChargeProduct
/
CUTOFF
;
tempEnergy
-=
scaledChargeProduct
/
CUTOFF
;
#endif
#endif
energy
+=
tempEnergy
;
if
(
needEnergy
)
energy
+=
tempEnergy
;
delta
*=
dEdR
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
y
-=
delta
.
y
;
...
...
platforms/cuda/src/kernels/langevin.cu
View file @
3b6925ae
...
@@ -78,7 +78,7 @@ extern "C" __global__ void integrateLangevinPart2(int numAtoms, real4* __restric
...
@@ -78,7 +78,7 @@ extern "C" __global__ void integrateLangevinPart2(int numAtoms, real4* __restric
* Select the step size to use for the next step.
* Select the step size to use for the next step.
*/
*/
extern
"C"
__global__
void
selectLangevinStepSize
(
int
numAtoms
,
int
paddedNumAtoms
,
mixed
maxStepSize
,
mixed
errorTol
,
mixed
tau
,
mixed
kT
,
mixed2
*
__restrict__
dt
,
extern
"C"
__global__
void
selectLangevinStepSize
(
int
numAtoms
,
int
paddedNumAtoms
,
mixed
maxStepSize
,
mixed
errorTol
,
mixed
friction
,
mixed
kT
,
mixed2
*
__restrict__
dt
,
const
mixed4
*
__restrict__
velm
,
const
long
long
*
__restrict__
force
,
mixed
*
__restrict__
paramBuffer
)
{
const
mixed4
*
__restrict__
velm
,
const
long
long
*
__restrict__
force
,
mixed
*
__restrict__
paramBuffer
)
{
// Calculate the error.
// Calculate the error.
...
@@ -119,9 +119,9 @@ extern "C" __global__ void selectLangevinStepSize(int numAtoms, int paddedNumAto
...
@@ -119,9 +119,9 @@ extern "C" __global__ void selectLangevinStepSize(int numAtoms, int paddedNumAto
// Recalculate the integration parameters.
// Recalculate the integration parameters.
mixed
vscale
=
EXP
(
-
newStepSize
/
tau
);
mixed
vscale
=
exp
(
-
newStepSize
*
friction
);
mixed
fscale
=
(
1
-
vscale
)
*
tau
;
mixed
fscale
=
(
friction
==
0
?
newStepSize
:
(
1
-
vscale
)
/
friction
)
;
mixed
noisescale
=
SQRT
(
2
*
kT
/
tau
)
*
SQRT
(
0.5
f
*
(
1
-
vscale
*
vscale
)
*
tau
);
mixed
noisescale
=
sqrt
(
kT
*
(
1
-
vscale
*
vscale
));
params
[
VelScale
]
=
vscale
;
params
[
VelScale
]
=
vscale
;
params
[
ForceScale
]
=
fscale
;
params
[
ForceScale
]
=
fscale
;
params
[
NoiseScale
]
=
noisescale
;
params
[
NoiseScale
]
=
noisescale
;
...
...
platforms/cuda/src/kernels/nonbonded.cu
View file @
3b6925ae
...
@@ -103,9 +103,10 @@ extern "C" __global__ void computeNonbonded(
...
@@ -103,9 +103,10 @@ extern "C" __global__ void computeNonbonded(
unsigned
long
long
*
__restrict__
forceBuffers
,
mixed
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
tileflags
*
__restrict__
exclusions
,
unsigned
long
long
*
__restrict__
forceBuffers
,
mixed
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
tileflags
*
__restrict__
exclusions
,
const
ushort2
*
__restrict__
exclusionTiles
,
unsigned
int
startTileIndex
,
unsigned
int
numTileIndices
const
ushort2
*
__restrict__
exclusionTiles
,
unsigned
int
startTileIndex
,
unsigned
int
numTileIndices
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
,
const
int
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
,
const
int
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
const
real4
*
__restrict__
blockSize
,
const
unsigned
int
*
__restrict__
interactingAtoms
const
real4
*
__restrict__
blockSize
,
const
unsigned
int
*
__restrict__
interactingAtoms
,
unsigned
int
maxSinglePairs
,
const
int2
*
__restrict__
singlePairs
#endif
#endif
PARAMETER_ARGUMENTS
)
{
PARAMETER_ARGUMENTS
)
{
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
...
@@ -278,10 +279,10 @@ extern "C" __global__ void computeNonbonded(
...
@@ -278,10 +279,10 @@ extern "C" __global__ void computeNonbonded(
localData
[
tbx
+
tj
].
fz
+=
dEdR2
.
z
;
localData
[
tbx
+
tj
].
fz
+=
dEdR2
.
z
;
#endif
#endif
#endif // end USE_SYMMETRIC
#endif // end USE_SYMMETRIC
#endif
#ifdef ENABLE_SHUFFLE
#ifdef ENABLE_SHUFFLE
SHUFFLE_WARP_DATA
SHUFFLE_WARP_DATA
#endif
#endif
#endif
#ifdef USE_EXCLUSIONS
#ifdef USE_EXCLUSIONS
excl
>>=
1
;
excl
>>=
1
;
#endif
#endif
...
@@ -379,7 +380,7 @@ extern "C" __global__ void computeNonbonded(
...
@@ -379,7 +380,7 @@ extern "C" __global__ void computeNonbonded(
LOAD_ATOM1_PARAMETERS
LOAD_ATOM1_PARAMETERS
//const unsigned int localAtomIndex = threadIdx.x;
//const unsigned int localAtomIndex = threadIdx.x;
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
]
:
y
*
TILE_SIZE
+
tgx
)
;
unsigned
int
j
=
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
];
#else
#else
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
#endif
...
@@ -484,9 +485,9 @@ extern "C" __global__ void computeNonbonded(
...
@@ -484,9 +485,9 @@ extern "C" __global__ void computeNonbonded(
localData
[
tbx
+
tj
].
fz
+=
dEdR2
.
z
;
localData
[
tbx
+
tj
].
fz
+=
dEdR2
.
z
;
#endif
#endif
#endif // end USE_SYMMETRIC
#endif // end USE_SYMMETRIC
#endif
#ifdef ENABLE_SHUFFLE
#ifdef ENABLE_SHUFFLE
SHUFFLE_WARP_DATA
SHUFFLE_WARP_DATA
#endif
#endif
#endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
...
@@ -555,9 +556,9 @@ extern "C" __global__ void computeNonbonded(
...
@@ -555,9 +556,9 @@ extern "C" __global__ void computeNonbonded(
localData
[
tbx
+
tj
].
fz
+=
dEdR2
.
z
;
localData
[
tbx
+
tj
].
fz
+=
dEdR2
.
z
;
#endif
#endif
#endif // end USE_SYMMETRIC
#endif // end USE_SYMMETRIC
#endif
#ifdef ENABLE_SHUFFLE
#ifdef ENABLE_SHUFFLE
SHUFFLE_WARP_DATA
SHUFFLE_WARP_DATA
#endif
#endif
#endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
...
@@ -588,6 +589,59 @@ extern "C" __global__ void computeNonbonded(
...
@@ -588,6 +589,59 @@ extern "C" __global__ void computeNonbonded(
}
}
pos
++
;
pos
++
;
}
}
// Third loop: single pairs that aren't part of a tile.
#if USE_CUTOFF
const
unsigned
int
numPairs
=
interactionCount
[
1
];
if
(
numPairs
>
maxSinglePairs
)
return
;
// There wasn't enough memory for the neighbor list.
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
numPairs
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
int2
pair
=
singlePairs
[
i
];
int
atom1
=
pair
.
x
;
int
atom2
=
pair
.
y
;
real4
posq1
=
posq
[
atom1
];
real4
posq2
=
posq
[
atom2
];
LOAD_ATOM1_PARAMETERS
int
j
=
atom2
;
atom2
=
threadIdx
.
x
;
DECLARE_LOCAL_PARAMETERS
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
LOAD_ATOM2_PARAMETERS
atom2
=
pair
.
y
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA
(
delta
)
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
real
invR
=
RSQRT
(
r2
);
real
r
=
r2
*
invR
;
#ifdef USE_SYMMETRIC
real
dEdR
=
0.0
f
;
#else
real3
dEdR1
=
make_real3
(
0
);
real3
dEdR2
=
make_real3
(
0
);
#endif
bool
hasExclusions
=
false
;
bool
isExcluded
=
false
;
real
tempEnergy
=
0.0
f
;
const
real
interactionScale
=
1.0
f
;
COMPUTE_INTERACTION
energy
+=
tempEnergy
;
#ifdef INCLUDE_FORCES
#ifdef USE_SYMMETRIC
real3
dEdR1
=
delta
*
dEdR
;
real3
dEdR2
=
-
dEdR1
;
#endif
atomicAdd
(
&
forceBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
-
dEdR1
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
-
dEdR1
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
-
dEdR1
.
z
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
-
dEdR2
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
-
dEdR2
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
-
dEdR2
.
z
*
0x100000000
)));
#endif
}
#endif
#ifdef INCLUDE_ENERGY
#ifdef INCLUDE_ENERGY
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
#endif
#endif
...
...
platforms/cuda/src/kernels/pme.cu
View file @
3b6925ae
...
@@ -21,7 +21,11 @@ extern "C" __global__ void findAtomGridIndex(const real4* __restrict__ posq, int
...
@@ -21,7 +21,11 @@ extern "C" __global__ void findAtomGridIndex(const real4* __restrict__ posq, int
extern
"C"
__global__
void
gridSpreadCharge
(
const
real4
*
__restrict__
posq
,
real
*
__restrict__
originalPmeGrid
,
extern
"C"
__global__
void
gridSpreadCharge
(
const
real4
*
__restrict__
posq
,
real
*
__restrict__
originalPmeGrid
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
,
const
int2
*
__restrict__
pmeAtomGridIndex
)
{
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
,
const
int2
*
__restrict__
pmeAtomGridIndex
#ifdef USE_LJPME
,
const
real2
*
__restrict__
sigmaEpsilon
#endif
)
{
real3
data
[
PME_ORDER
];
real3
data
[
PME_ORDER
];
const
real
scale
=
RECIP
(
PME_ORDER
-
1
);
const
real
scale
=
RECIP
(
PME_ORDER
-
1
);
...
@@ -62,7 +66,13 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
...
@@ -62,7 +66,13 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
data
[
0
]
=
scale
*
(
make_real3
(
1
)
-
dr
)
*
data
[
0
];
data
[
0
]
=
scale
*
(
make_real3
(
1
)
-
dr
)
*
data
[
0
];
// Spread the charge from this atom onto each grid point.
// Spread the charge from this atom onto each grid point.
#ifdef USE_LJPME
const
real2
sigEps
=
sigmaEpsilon
[
atom
];
const
real
charge
=
8
*
sigEps
.
x
*
sigEps
.
x
*
sigEps
.
x
*
sigEps
.
y
;
#else
const
real
charge
=
pos
.
w
;
#endif
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
int
xbase
=
gridIndex
.
x
+
ix
;
int
xbase
=
gridIndex
.
x
+
ix
;
xbase
-=
(
xbase
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
);
xbase
-=
(
xbase
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
);
...
@@ -80,7 +90,7 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
...
@@ -80,7 +90,7 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
zindex
-=
(
zindex
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
);
zindex
-=
(
zindex
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
);
int
index
=
ybase
+
zindex
;
int
index
=
ybase
+
zindex
;
real
add
=
pos
.
w
*
dx
*
dy
*
data
[
iz
].
z
;
real
add
=
charge
*
dx
*
dy
*
data
[
iz
].
z
;
#ifdef USE_DOUBLE_PRECISION
#ifdef USE_DOUBLE_PRECISION
unsigned
long
long
*
ulonglong_p
=
(
unsigned
long
long
*
)
originalPmeGrid
;
unsigned
long
long
*
ulonglong_p
=
(
unsigned
long
long
*
)
originalPmeGrid
;
atomicAdd
(
&
ulonglong_p
[
index
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
add
*
0x100000000
)));
atomicAdd
(
&
ulonglong_p
[
index
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
add
*
0x100000000
)));
...
@@ -121,7 +131,15 @@ reciprocalConvolution(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict
...
@@ -121,7 +131,15 @@ reciprocalConvolution(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict
real4
periodicBoxSize
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
real4
periodicBoxSize
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
// R2C stores into a half complex matrix where the last dimension is cut by half
// R2C stores into a half complex matrix where the last dimension is cut by half
const
unsigned
int
gridSize
=
GRID_SIZE_X
*
GRID_SIZE_Y
*
(
GRID_SIZE_Z
/
2
+
1
);
const
unsigned
int
gridSize
=
GRID_SIZE_X
*
GRID_SIZE_Y
*
(
GRID_SIZE_Z
/
2
+
1
);
#ifdef USE_LJPME
const
real
recipScaleFactor
=
-
2
*
M_PI
*
SQRT
(
M_PI
)
*
RECIP
(
6
*
periodicBoxSize
.
x
*
periodicBoxSize
.
y
*
periodicBoxSize
.
z
);
real
bfac
=
M_PI
/
EWALD_ALPHA
;
real
fac1
=
2
*
M_PI
*
M_PI
*
M_PI
*
SQRT
(
M_PI
);
real
fac2
=
EWALD_ALPHA
*
EWALD_ALPHA
*
EWALD_ALPHA
;
real
fac3
=
-
2
*
EWALD_ALPHA
*
M_PI
*
M_PI
;
#else
const
real
recipScaleFactor
=
RECIP
(
M_PI
*
periodicBoxSize
.
x
*
periodicBoxSize
.
y
*
periodicBoxSize
.
z
);
const
real
recipScaleFactor
=
RECIP
(
M_PI
*
periodicBoxSize
.
x
*
periodicBoxSize
.
y
*
periodicBoxSize
.
z
);
#endif
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
gridSize
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
gridSize
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
// real indices
// real indices
...
@@ -140,12 +158,23 @@ reciprocalConvolution(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict
...
@@ -140,12 +158,23 @@ reciprocalConvolution(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict
real
bz
=
pmeBsplineModuliZ
[
kz
];
real
bz
=
pmeBsplineModuliZ
[
kz
];
real2
grid
=
halfcomplex_pmeGrid
[
index
];
real2
grid
=
halfcomplex_pmeGrid
[
index
];
real
m2
=
mhx
*
mhx
+
mhy
*
mhy
+
mhz
*
mhz
;
real
m2
=
mhx
*
mhx
+
mhy
*
mhy
+
mhz
*
mhz
;
#ifdef USE_LJPME
real
denom
=
recipScaleFactor
/
(
bx
*
by
*
bz
);
real
m
=
SQRT
(
m2
);
real
m3
=
m
*
m2
;
real
b
=
bfac
*
m
;
real
expfac
=
-
b
*
b
;
real
expterm
=
EXP
(
expfac
);
real
erfcterm
=
ERFC
(
b
);
real
eterm
=
(
fac1
*
erfcterm
*
m3
+
expterm
*
(
fac2
+
fac3
*
m2
))
*
denom
;
halfcomplex_pmeGrid
[
index
]
=
make_real2
(
grid
.
x
*
eterm
,
grid
.
y
*
eterm
);
#else
real
denom
=
m2
*
bx
*
by
*
bz
;
real
denom
=
m2
*
bx
*
by
*
bz
;
real
eterm
=
recipScaleFactor
*
EXP
(
-
RECIP_EXP_FACTOR
*
m2
)
/
denom
;
real
eterm
=
recipScaleFactor
*
EXP
(
-
RECIP_EXP_FACTOR
*
m2
)
/
denom
;
if
(
kx
!=
0
||
ky
!=
0
||
kz
!=
0
)
{
if
(
kx
!=
0
||
ky
!=
0
||
kz
!=
0
)
{
halfcomplex_pmeGrid
[
index
]
=
make_real2
(
grid
.
x
*
eterm
,
grid
.
y
*
eterm
);
halfcomplex_pmeGrid
[
index
]
=
make_real2
(
grid
.
x
*
eterm
,
grid
.
y
*
eterm
);
}
}
#endif
}
}
}
}
...
@@ -156,8 +185,16 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__
...
@@ -156,8 +185,16 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__
real4
periodicBoxSize
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
real4
periodicBoxSize
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
)
{
// R2C stores into a half complex matrix where the last dimension is cut by half
// R2C stores into a half complex matrix where the last dimension is cut by half
const
unsigned
int
gridSize
=
GRID_SIZE_X
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
const
unsigned
int
gridSize
=
GRID_SIZE_X
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
#ifdef USE_LJPME
const
real
recipScaleFactor
=
-
2
*
M_PI
*
SQRT
(
M_PI
)
*
RECIP
(
6
*
periodicBoxSize
.
x
*
periodicBoxSize
.
y
*
periodicBoxSize
.
z
);
real
bfac
=
M_PI
/
EWALD_ALPHA
;
real
fac1
=
2
*
M_PI
*
M_PI
*
M_PI
*
SQRT
(
M_PI
);
real
fac2
=
EWALD_ALPHA
*
EWALD_ALPHA
*
EWALD_ALPHA
;
real
fac3
=
-
2
*
EWALD_ALPHA
*
M_PI
*
M_PI
;
#else
const
real
recipScaleFactor
=
RECIP
(
M_PI
*
periodicBoxSize
.
x
*
periodicBoxSize
.
y
*
periodicBoxSize
.
z
);
const
real
recipScaleFactor
=
RECIP
(
M_PI
*
periodicBoxSize
.
x
*
periodicBoxSize
.
y
*
periodicBoxSize
.
z
);
#endif
mixed
energy
=
0
;
mixed
energy
=
0
;
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
gridSize
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
gridSize
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
// real indices
// real indices
...
@@ -175,8 +212,19 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__
...
@@ -175,8 +212,19 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__
real
bx
=
pmeBsplineModuliX
[
kx
];
real
bx
=
pmeBsplineModuliX
[
kx
];
real
by
=
pmeBsplineModuliY
[
ky
];
real
by
=
pmeBsplineModuliY
[
ky
];
real
bz
=
pmeBsplineModuliZ
[
kz
];
real
bz
=
pmeBsplineModuliZ
[
kz
];
#ifdef USE_LJPME
real
denom
=
recipScaleFactor
/
(
bx
*
by
*
bz
);
real
m
=
SQRT
(
m2
);
real
m3
=
m
*
m2
;
real
b
=
bfac
*
m
;
real
expfac
=
-
b
*
b
;
real
expterm
=
EXP
(
expfac
);
real
erfcterm
=
ERFC
(
b
);
real
eterm
=
(
fac1
*
erfcterm
*
m3
+
expterm
*
(
fac2
+
fac3
*
m2
))
*
denom
;
#else
real
denom
=
m2
*
bx
*
by
*
bz
;
real
denom
=
m2
*
bx
*
by
*
bz
;
real
eterm
=
recipScaleFactor
*
EXP
(
-
RECIP_EXP_FACTOR
*
m2
)
/
denom
;
real
eterm
=
recipScaleFactor
*
EXP
(
-
RECIP_EXP_FACTOR
*
m2
)
/
denom
;
#endif
if
(
kz
>=
(
GRID_SIZE_Z
/
2
+
1
))
{
if
(
kz
>=
(
GRID_SIZE_Z
/
2
+
1
))
{
kx
=
((
kx
==
0
)
?
kx
:
GRID_SIZE_X
-
kx
);
kx
=
((
kx
==
0
)
?
kx
:
GRID_SIZE_X
-
kx
);
...
@@ -185,9 +233,10 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__
...
@@ -185,9 +233,10 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__
}
}
int
indexInHalfComplexGrid
=
kz
+
ky
*
(
GRID_SIZE_Z
/
2
+
1
)
+
kx
*
(
GRID_SIZE_Y
*
(
GRID_SIZE_Z
/
2
+
1
));
int
indexInHalfComplexGrid
=
kz
+
ky
*
(
GRID_SIZE_Z
/
2
+
1
)
+
kx
*
(
GRID_SIZE_Y
*
(
GRID_SIZE_Z
/
2
+
1
));
real2
grid
=
halfcomplex_pmeGrid
[
indexInHalfComplexGrid
];
real2
grid
=
halfcomplex_pmeGrid
[
indexInHalfComplexGrid
];
if
(
kx
!=
0
||
ky
!=
0
||
kz
!=
0
)
{
#ifndef USE_LJPME
if
(
kx
!=
0
||
ky
!=
0
||
kz
!=
0
)
#endif
energy
+=
eterm
*
(
grid
.
x
*
grid
.
x
+
grid
.
y
*
grid
.
y
);
energy
+=
eterm
*
(
grid
.
x
*
grid
.
x
+
grid
.
y
*
grid
.
y
);
}
}
}
#ifdef USE_PME_STREAM
#ifdef USE_PME_STREAM
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
=
0.5
f
*
energy
;
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
=
0.5
f
*
energy
;
...
@@ -199,7 +248,11 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__
...
@@ -199,7 +248,11 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__
extern
"C"
__global__
extern
"C"
__global__
void
gridInterpolateForce
(
const
real4
*
__restrict__
posq
,
unsigned
long
long
*
__restrict__
forceBuffers
,
const
real
*
__restrict__
originalPmeGrid
,
void
gridInterpolateForce
(
const
real4
*
__restrict__
posq
,
unsigned
long
long
*
__restrict__
forceBuffers
,
const
real
*
__restrict__
originalPmeGrid
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
real4
periodicBoxVecX
,
real4
periodicBoxVecY
,
real4
periodicBoxVecZ
,
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
,
const
int2
*
__restrict__
pmeAtomGridIndex
)
{
real3
recipBoxVecX
,
real3
recipBoxVecY
,
real3
recipBoxVecZ
,
const
int2
*
__restrict__
pmeAtomGridIndex
#ifdef USE_LJPME
,
const
real2
*
__restrict__
sigmaEpsilon
#endif
)
{
real3
data
[
PME_ORDER
];
real3
data
[
PME_ORDER
];
real3
ddata
[
PME_ORDER
];
real3
ddata
[
PME_ORDER
];
const
real
scale
=
RECIP
(
PME_ORDER
-
1
);
const
real
scale
=
RECIP
(
PME_ORDER
-
1
);
...
@@ -271,7 +324,12 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __
...
@@ -271,7 +324,12 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __
}
}
}
}
}
}
#ifdef USE_LJPME
const
real2
sigEps
=
sigmaEpsilon
[
atom
];
real
q
=
8
*
sigEps
.
x
*
sigEps
.
x
*
sigEps
.
x
*
sigEps
.
y
;
#else
real
q
=
pos
.
w
*
EPSILON_FACTOR
;
real
q
=
pos
.
w
*
EPSILON_FACTOR
;
#endif
real
forceX
=
-
q
*
(
force
.
x
*
GRID_SIZE_X
*
recipBoxVecX
.
x
);
real
forceX
=
-
q
*
(
force
.
x
*
GRID_SIZE_X
*
recipBoxVecX
.
x
);
real
forceY
=
-
q
*
(
force
.
x
*
GRID_SIZE_X
*
recipBoxVecY
.
x
+
force
.
y
*
GRID_SIZE_Y
*
recipBoxVecY
.
y
);
real
forceY
=
-
q
*
(
force
.
x
*
GRID_SIZE_X
*
recipBoxVecY
.
x
+
force
.
y
*
GRID_SIZE_Y
*
recipBoxVecY
.
y
);
real
forceZ
=
-
q
*
(
force
.
x
*
GRID_SIZE_X
*
recipBoxVecZ
.
x
+
force
.
y
*
GRID_SIZE_Y
*
recipBoxVecZ
.
y
+
force
.
z
*
GRID_SIZE_Z
*
recipBoxVecZ
.
z
);
real
forceZ
=
-
q
*
(
force
.
x
*
GRID_SIZE_X
*
recipBoxVecZ
.
x
+
force
.
y
*
GRID_SIZE_Y
*
recipBoxVecZ
.
y
+
force
.
z
*
GRID_SIZE_Z
*
recipBoxVecZ
.
z
);
...
...
platforms/cuda/src/kernels/sort.cu
View file @
3b6925ae
...
@@ -52,7 +52,8 @@ __global__ void sortShortList(DATA_TYPE* __restrict__ data, unsigned int length)
...
@@ -52,7 +52,8 @@ __global__ void sortShortList(DATA_TYPE* __restrict__ data, unsigned int length)
*/
*/
__global__
void
computeRange
(
const
DATA_TYPE
*
__restrict__
data
,
unsigned
int
length
,
KEY_TYPE
*
__restrict__
range
,
__global__
void
computeRange
(
const
DATA_TYPE
*
__restrict__
data
,
unsigned
int
length
,
KEY_TYPE
*
__restrict__
range
,
unsigned
int
numBuckets
,
unsigned
int
*
__restrict__
bucketOffset
)
{
unsigned
int
numBuckets
,
unsigned
int
*
__restrict__
bucketOffset
)
{
extern
__shared__
KEY_TYPE
rangeBuffer
[];
extern
__shared__
KEY_TYPE
minBuffer
[];
KEY_TYPE
*
maxBuffer
=
minBuffer
+
blockDim
.
x
;
KEY_TYPE
minimum
=
MAX_KEY
;
KEY_TYPE
minimum
=
MAX_KEY
;
KEY_TYPE
maximum
=
MIN_KEY
;
KEY_TYPE
maximum
=
MIN_KEY
;
...
@@ -66,23 +67,18 @@ __global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int le
...
@@ -66,23 +67,18 @@ __global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int le
// Now reduce them.
// Now reduce them.
rangeBuffer
[
threadIdx
.
x
]
=
minimum
;
minBuffer
[
threadIdx
.
x
]
=
minimum
;
maxBuffer
[
threadIdx
.
x
]
=
maximum
;
__syncthreads
();
__syncthreads
();
for
(
unsigned
int
step
=
1
;
step
<
blockDim
.
x
;
step
*=
2
)
{
for
(
unsigned
int
step
=
1
;
step
<
blockDim
.
x
;
step
*=
2
)
{
if
(
threadIdx
.
x
+
step
<
blockDim
.
x
&&
threadIdx
.
x
%
(
2
*
step
)
==
0
)
if
(
threadIdx
.
x
+
step
<
blockDim
.
x
&&
threadIdx
.
x
%
(
2
*
step
)
==
0
)
{
rangeBuffer
[
threadIdx
.
x
]
=
min
(
rangeBuffer
[
threadIdx
.
x
],
rangeBuffer
[
threadIdx
.
x
+
step
]);
minBuffer
[
threadIdx
.
x
]
=
min
(
minBuffer
[
threadIdx
.
x
],
minBuffer
[
threadIdx
.
x
+
step
]);
__syncthreads
();
maxBuffer
[
threadIdx
.
x
]
=
max
(
maxBuffer
[
threadIdx
.
x
],
maxBuffer
[
threadIdx
.
x
+
step
]);
}
}
minimum
=
rangeBuffer
[
0
];
__syncthreads
();
rangeBuffer
[
threadIdx
.
x
]
=
maximum
;
__syncthreads
();
for
(
unsigned
int
step
=
1
;
step
<
blockDim
.
x
;
step
*=
2
)
{
if
(
threadIdx
.
x
+
step
<
blockDim
.
x
&&
threadIdx
.
x
%
(
2
*
step
)
==
0
)
rangeBuffer
[
threadIdx
.
x
]
=
max
(
rangeBuffer
[
threadIdx
.
x
],
rangeBuffer
[
threadIdx
.
x
+
step
]);
__syncthreads
();
__syncthreads
();
}
}
maximum
=
rangeBuffer
[
0
];
minimum
=
minBuffer
[
0
];
maximum
=
maxBuffer
[
0
];
if
(
threadIdx
.
x
==
0
)
{
if
(
threadIdx
.
x
==
0
)
{
range
[
0
]
=
minimum
;
range
[
0
]
=
minimum
;
range
[
1
]
=
maximum
;
range
[
1
]
=
maximum
;
...
@@ -98,7 +94,7 @@ __global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int le
...
@@ -98,7 +94,7 @@ __global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int le
* Assign elements to buckets.
* Assign elements to buckets.
*/
*/
__global__
void
assignElementsToBuckets
(
const
DATA_TYPE
*
__restrict__
data
,
unsigned
int
length
,
unsigned
int
numBuckets
,
const
KEY_TYPE
*
__restrict__
range
,
__global__
void
assignElementsToBuckets
(
const
DATA_TYPE
*
__restrict__
data
,
unsigned
int
length
,
unsigned
int
numBuckets
,
const
KEY_TYPE
*
__restrict__
range
,
unsigned
int
*
bucketOffset
,
unsigned
int
*
__restrict__
bucketOfElement
,
unsigned
int
*
__restrict__
offsetInBucket
)
{
unsigned
int
*
__restrict__
bucketOffset
,
unsigned
int
*
__restrict__
bucketOfElement
,
unsigned
int
*
__restrict__
offsetInBucket
)
{
float
minValue
=
(
float
)
(
range
[
0
]);
float
minValue
=
(
float
)
(
range
[
0
]);
float
maxValue
=
(
float
)
(
range
[
1
]);
float
maxValue
=
(
float
)
(
range
[
1
]);
float
bucketWidth
=
(
maxValue
-
minValue
)
/
numBuckets
;
float
bucketWidth
=
(
maxValue
-
minValue
)
/
numBuckets
;
...
...
platforms/opencl/include/OpenCLKernels.h
View file @
3b6925ae
...
@@ -607,13 +607,22 @@ public:
...
@@ -607,13 +607,22 @@ public:
void
copyParametersToContext
(
ContextImpl
&
context
,
const
NonbondedForce
&
force
);
void
copyParametersToContext
(
ContextImpl
&
context
,
const
NonbondedForce
&
force
);
/**
/**
* Get the parameters being used for PME.
* Get the parameters being used for PME.
*
*
* @param alpha the separation parameter
* @param alpha the separation parameter
* @param nx the number of grid points along the X axis
* @param nx the number of grid points along the X axis
* @param ny the number of grid points along the Y axis
* @param ny the number of grid points along the Y axis
* @param nz the number of grid points along the Z axis
* @param nz the number of grid points along the Z axis
*/
*/
void
getPMEParameters
(
double
&
alpha
,
int
&
nx
,
int
&
ny
,
int
&
nz
)
const
;
void
getPMEParameters
(
double
&
alpha
,
int
&
nx
,
int
&
ny
,
int
&
nz
)
const
;
/**
* Get the parameters being used for the dispersion term in LJPME.
*
* @param alpha the separation parameter
* @param nx the number of grid points along the X axis
* @param ny the number of grid points along the Y axis
* @param nz the number of grid points along the Z axis
*/
void
getLJPMEParameters
(
double
&
alpha
,
int
&
nx
,
int
&
ny
,
int
&
nz
)
const
;
private:
private:
class
SortTrait
:
public
OpenCLSort
::
SortTrait
{
class
SortTrait
:
public
OpenCLSort
::
SortTrait
{
int
getDataSize
()
const
{
return
8
;}
int
getDataSize
()
const
{
return
8
;}
...
@@ -664,8 +673,9 @@ private:
...
@@ -664,8 +673,9 @@ private:
cl
::
Kernel
pmeInterpolateForceKernel
;
cl
::
Kernel
pmeInterpolateForceKernel
;
std
::
map
<
std
::
string
,
std
::
string
>
pmeDefines
;
std
::
map
<
std
::
string
,
std
::
string
>
pmeDefines
;
std
::
vector
<
std
::
pair
<
int
,
int
>
>
exceptionAtoms
;
std
::
vector
<
std
::
pair
<
int
,
int
>
>
exceptionAtoms
;
double
ewaldSelfEnergy
,
dispersionCoefficient
,
alpha
;
double
ewaldSelfEnergy
,
dispersionCoefficient
,
alpha
,
dispersionAlpha
;
int
gridSizeX
,
gridSizeY
,
gridSizeZ
;
int
gridSizeX
,
gridSizeY
,
gridSizeZ
;
int
dispersionGridSizeX
,
dispersionGridSizeY
,
dispersionGridSizeZ
;
bool
hasCoulomb
,
hasLJ
,
usePmeQueue
;
bool
hasCoulomb
,
hasLJ
,
usePmeQueue
;
NonbondedMethod
nonbondedMethod
;
NonbondedMethod
nonbondedMethod
;
static
const
int
PmeOrder
=
5
;
static
const
int
PmeOrder
=
5
;
...
@@ -1419,7 +1429,7 @@ private:
...
@@ -1419,7 +1429,7 @@ private:
void
prepareForComputation
(
ContextImpl
&
context
,
CustomIntegrator
&
integrator
,
bool
&
forcesAreValid
);
void
prepareForComputation
(
ContextImpl
&
context
,
CustomIntegrator
&
integrator
,
bool
&
forcesAreValid
);
Lepton
::
ExpressionTreeNode
replaceDerivFunctions
(
const
Lepton
::
ExpressionTreeNode
&
node
,
OpenMM
::
ContextImpl
&
context
);
Lepton
::
ExpressionTreeNode
replaceDerivFunctions
(
const
Lepton
::
ExpressionTreeNode
&
node
,
OpenMM
::
ContextImpl
&
context
);
void
findExpressionsForDerivs
(
const
Lepton
::
ExpressionTreeNode
&
node
,
std
::
vector
<
std
::
pair
<
Lepton
::
ExpressionTreeNode
,
std
::
string
>
>&
variableNodes
);
void
findExpressionsForDerivs
(
const
Lepton
::
ExpressionTreeNode
&
node
,
std
::
vector
<
std
::
pair
<
Lepton
::
ExpressionTreeNode
,
std
::
string
>
>&
variableNodes
);
void
recordGlobalValue
(
double
value
,
GlobalTarget
target
);
void
recordGlobalValue
(
double
value
,
GlobalTarget
target
,
CustomIntegrator
&
integrator
);
void
recordChangedParameters
(
ContextImpl
&
context
);
void
recordChangedParameters
(
ContextImpl
&
context
);
bool
evaluateCondition
(
int
step
);
bool
evaluateCondition
(
int
step
);
OpenCLContext
&
cl
;
OpenCLContext
&
cl
;
...
...
platforms/opencl/include/OpenCLParallelKernels.h
View file @
3b6925ae
...
@@ -431,13 +431,22 @@ public:
...
@@ -431,13 +431,22 @@ public:
void
copyParametersToContext
(
ContextImpl
&
context
,
const
NonbondedForce
&
force
);
void
copyParametersToContext
(
ContextImpl
&
context
,
const
NonbondedForce
&
force
);
/**
/**
* Get the parameters being used for PME.
* Get the parameters being used for PME.
*
*
* @param alpha the separation parameter
* @param alpha the separation parameter
* @param nx the number of grid points along the X axis
* @param nx the number of grid points along the X axis
* @param ny the number of grid points along the Y axis
* @param ny the number of grid points along the Y axis
* @param nz the number of grid points along the Z axis
* @param nz the number of grid points along the Z axis
*/
*/
void
getPMEParameters
(
double
&
alpha
,
int
&
nx
,
int
&
ny
,
int
&
nz
)
const
;
void
getPMEParameters
(
double
&
alpha
,
int
&
nx
,
int
&
ny
,
int
&
nz
)
const
;
/**
* Get the parameters being used for the dispersion term in LJPME.
*
* @param alpha the separation parameter
* @param nx the number of grid points along the X axis
* @param ny the number of grid points along the Y axis
* @param nz the number of grid points along the Z axis
*/
void
getLJPMEParameters
(
double
&
alpha
,
int
&
nx
,
int
&
ny
,
int
&
nz
)
const
;
private:
private:
class
Task
;
class
Task
;
OpenCLPlatform
::
PlatformData
&
data
;
OpenCLPlatform
::
PlatformData
&
data
;
...
...
platforms/opencl/src/OpenCLKernels.cpp
View file @
3b6925ae
This diff is collapsed.
Click to expand it.
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
View file @
3b6925ae
...
@@ -511,6 +511,8 @@ void OpenCLNonbondedUtilities::createKernelsForGroups(int groups) {
...
@@ -511,6 +511,8 @@ void OpenCLNonbondedUtilities::createKernelsForGroups(int groups) {
defines
[
"SIMD_WIDTH"
]
=
context
.
intToString
(
context
.
getSIMDWidth
());
defines
[
"SIMD_WIDTH"
]
=
context
.
intToString
(
context
.
getSIMDWidth
());
if
(
usePeriodic
)
if
(
usePeriodic
)
defines
[
"USE_PERIODIC"
]
=
"1"
;
defines
[
"USE_PERIODIC"
]
=
"1"
;
if
(
context
.
getBoxIsTriclinic
())
defines
[
"TRICLINIC"
]
=
"1"
;
defines
[
"MAX_EXCLUSIONS"
]
=
context
.
intToString
(
maxExclusions
);
defines
[
"MAX_EXCLUSIONS"
]
=
context
.
intToString
(
maxExclusions
);
defines
[
"BUFFER_GROUPS"
]
=
(
deviceIsCpu
?
"4"
:
"2"
);
defines
[
"BUFFER_GROUPS"
]
=
(
deviceIsCpu
?
"4"
:
"2"
);
string
file
=
(
deviceIsCpu
?
OpenCLKernelSources
::
findInteractingBlocks_cpu
:
OpenCLKernelSources
::
findInteractingBlocks
);
string
file
=
(
deviceIsCpu
?
OpenCLKernelSources
::
findInteractingBlocks_cpu
:
OpenCLKernelSources
::
findInteractingBlocks
);
...
...
platforms/opencl/src/OpenCLParallelKernels.cpp
View file @
3b6925ae
...
@@ -583,6 +583,10 @@ void OpenCLParallelCalcNonbondedForceKernel::getPMEParameters(double& alpha, int
...
@@ -583,6 +583,10 @@ void OpenCLParallelCalcNonbondedForceKernel::getPMEParameters(double& alpha, int
dynamic_cast
<
const
OpenCLCalcNonbondedForceKernel
&>
(
kernels
[
0
].
getImpl
()).
getPMEParameters
(
alpha
,
nx
,
ny
,
nz
);
dynamic_cast
<
const
OpenCLCalcNonbondedForceKernel
&>
(
kernels
[
0
].
getImpl
()).
getPMEParameters
(
alpha
,
nx
,
ny
,
nz
);
}
}
void
OpenCLParallelCalcNonbondedForceKernel
::
getLJPMEParameters
(
double
&
alpha
,
int
&
nx
,
int
&
ny
,
int
&
nz
)
const
{
dynamic_cast
<
const
OpenCLCalcNonbondedForceKernel
&>
(
kernels
[
0
].
getImpl
()).
getLJPMEParameters
(
alpha
,
nx
,
ny
,
nz
);
}
class
OpenCLParallelCalcCustomNonbondedForceKernel
::
Task
:
public
OpenCLContext
::
WorkTask
{
class
OpenCLParallelCalcCustomNonbondedForceKernel
::
Task
:
public
OpenCLContext
::
WorkTask
{
public:
public:
Task
(
ContextImpl
&
context
,
OpenCLCalcCustomNonbondedForceKernel
&
kernel
,
bool
includeForce
,
Task
(
ContextImpl
&
context
,
OpenCLCalcCustomNonbondedForceKernel
&
kernel
,
bool
includeForce
,
...
...
Prev
1
2
3
4
5
6
7
8
9
10
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment