Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
93c467b2
Commit
93c467b2
authored
Mar 22, 2013
by
Peter Eastman
Browse files
Merged 5.1Optimizations branch back to trunk
parent
f6d4557d
Changes
86
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1724 additions
and
1166 deletions
+1724
-1166
platforms/cuda/src/CudaContext.cpp
platforms/cuda/src/CudaContext.cpp
+2
-1
platforms/cuda/src/CudaContext.h
platforms/cuda/src/CudaContext.h
+2
-0
platforms/cuda/src/CudaIntegrationUtilities.cpp
platforms/cuda/src/CudaIntegrationUtilities.cpp
+14
-16
platforms/cuda/src/CudaIntegrationUtilities.h
platforms/cuda/src/CudaIntegrationUtilities.h
+2
-3
platforms/cuda/src/CudaKernels.cpp
platforms/cuda/src/CudaKernels.cpp
+99
-71
platforms/cuda/src/CudaKernels.h
platforms/cuda/src/CudaKernels.h
+3
-7
platforms/cuda/src/CudaNonbondedUtilities.cpp
platforms/cuda/src/CudaNonbondedUtilities.cpp
+171
-100
platforms/cuda/src/CudaNonbondedUtilities.h
platforms/cuda/src/CudaNonbondedUtilities.h
+35
-23
platforms/cuda/src/CudaParallelKernels.cpp
platforms/cuda/src/CudaParallelKernels.cpp
+15
-17
platforms/cuda/src/CudaParallelKernels.h
platforms/cuda/src/CudaParallelKernels.h
+2
-2
platforms/cuda/src/CudaSort.cpp
platforms/cuda/src/CudaSort.cpp
+42
-32
platforms/cuda/src/CudaSort.h
platforms/cuda/src/CudaSort.h
+3
-2
platforms/cuda/src/kernels/coulombLennardJones.cu
platforms/cuda/src/kernels/coulombLennardJones.cu
+3
-3
platforms/cuda/src/kernels/customGBEnergyN2.cu
platforms/cuda/src/kernels/customGBEnergyN2.cu
+267
-134
platforms/cuda/src/kernels/customGBValueN2.cu
platforms/cuda/src/kernels/customGBValueN2.cu
+223
-168
platforms/cuda/src/kernels/customHbondForce.cu
platforms/cuda/src/kernels/customHbondForce.cu
+2
-2
platforms/cuda/src/kernels/ewald.cu
platforms/cuda/src/kernels/ewald.cu
+6
-6
platforms/cuda/src/kernels/findInteractingBlocks.cu
platforms/cuda/src/kernels/findInteractingBlocks.cu
+261
-143
platforms/cuda/src/kernels/gbsaObc1.cu
platforms/cuda/src/kernels/gbsaObc1.cu
+539
-408
platforms/cuda/src/kernels/integrationUtilities.cu
platforms/cuda/src/kernels/integrationUtilities.cu
+33
-28
No files found.
platforms/cuda/src/CudaContext.cpp
View file @
93c467b2
...
...
@@ -61,7 +61,7 @@ using namespace OpenMM;
using
namespace
std
;
const
int
CudaContext
::
ThreadBlockSize
=
64
;
const
int
CudaContext
::
TileSize
=
32
;
const
int
CudaContext
::
TileSize
=
sizeof
(
tileflags
)
*
8
;
bool
CudaContext
::
hasInitializedCuda
=
false
;
CudaContext
::
CudaContext
(
const
System
&
system
,
int
deviceIndex
,
bool
useBlockingSync
,
const
string
&
precision
,
const
string
&
compiler
,
...
...
@@ -369,6 +369,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
src
<<
"typedef float3 mixed3;
\n
"
;
src
<<
"typedef float4 mixed4;
\n
"
;
}
src
<<
"typedef unsigned int tileflags;
\n
"
;
for
(
map
<
string
,
string
>::
const_iterator
iter
=
defines
.
begin
();
iter
!=
defines
.
end
();
++
iter
)
{
src
<<
"#define "
<<
iter
->
first
;
if
(
!
iter
->
second
.
empty
())
...
...
platforms/cuda/src/CudaContext.h
View file @
93c467b2
...
...
@@ -42,6 +42,8 @@
#include "windowsExportCuda.h"
#include "CudaPlatform.h"
typedef
unsigned
int
tileflags
;
namespace
OpenMM
{
class
CudaArray
;
...
...
platforms/cuda/src/CudaIntegrationUtilities.cpp
View file @
93c467b2
...
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-201
2
Stanford University and the Authors. *
* Portions copyright (c) 2009-201
3
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -99,7 +99,7 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
posDelta
(
NULL
),
settleAtoms
(
NULL
),
settleParams
(
NULL
),
shakeAtoms
(
NULL
),
shakeParams
(
NULL
),
random
(
NULL
),
randomSeed
(
NULL
),
randomPos
(
0
),
stepSize
(
NULL
),
ccmaAtoms
(
NULL
),
ccmaDistance
(
NULL
),
ccmaReducedMass
(
NULL
),
ccmaAtomConstraints
(
NULL
),
ccmaNumAtomConstraints
(
NULL
),
ccmaConstraintMatrixColumn
(
NULL
),
ccmaConstraintMatrixValue
(
NULL
),
ccmaDelta1
(
NULL
),
ccmaDelta2
(
NULL
),
ccmaConverged
Memory
(
NULL
),
ccmaConstraintMatrixValue
(
NULL
),
ccmaDelta1
(
NULL
),
ccmaDelta2
(
NULL
),
ccmaConverged
(
NULL
),
vsite2AvgAtoms
(
NULL
),
vsite2AvgWeights
(
NULL
),
vsite3AvgAtoms
(
NULL
),
vsite3AvgWeights
(
NULL
),
vsiteOutOfPlaneAtoms
(
NULL
),
vsiteOutOfPlaneWeights
(
NULL
)
{
// Create workspace arrays.
...
...
@@ -466,9 +466,8 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
ccmaAtoms
=
CudaArray
::
create
<
int2
>
(
context
,
numCCMA
,
"CcmaAtoms"
);
ccmaAtomConstraints
=
CudaArray
::
create
<
int
>
(
context
,
numAtoms
*
maxAtomConstraints
,
"CcmaAtomConstraints"
);
ccmaNumAtomConstraints
=
CudaArray
::
create
<
int
>
(
context
,
numAtoms
,
"CcmaAtomConstraintsIndex"
);
CHECK_RESULT2
(
cuMemHostAlloc
((
void
**
)
&
ccmaConvergedMemory
,
2
*
sizeof
(
int
),
CU_MEMHOSTALLOC_DEVICEMAP
),
"Error allocating pinned memory"
);
CHECK_RESULT2
(
cuMemHostGetDevicePointer
(
&
ccmaConvergedDeviceMemory
,
ccmaConvergedMemory
,
0
),
"Error getting device address for pinned memory"
);
ccmaConstraintMatrixColumn
=
CudaArray
::
create
<
int
>
(
context
,
numCCMA
*
maxRowElements
,
"ConstraintMatrixColumn"
);
ccmaConverged
=
CudaArray
::
create
<
int
>
(
context
,
2
,
"ccmaConverged"
);
vector
<
int2
>
atomsVec
(
ccmaAtoms
->
getSize
());
vector
<
int
>
atomConstraintsVec
(
ccmaAtomConstraints
->
getSize
());
vector
<
int
>
numAtomConstraintsVec
(
ccmaNumAtomConstraints
->
getSize
());
...
...
@@ -680,8 +679,8 @@ CudaIntegrationUtilities::~CudaIntegrationUtilities() {
delete
ccmaDelta1
;
if
(
ccmaDelta2
!=
NULL
)
delete
ccmaDelta2
;
if
(
ccmaConverged
Memory
!=
NULL
)
cuMemFreeHost
(
ccmaConverged
Memory
)
;
if
(
ccmaConverged
!=
NULL
)
delete
ccmaConverged
;
if
(
vsite2AvgAtoms
!=
NULL
)
delete
vsite2AvgAtoms
;
if
(
vsite2AvgWeights
!=
NULL
)
...
...
@@ -734,33 +733,32 @@ void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double
context
.
executeKernel
(
shakeKernel
,
args
,
shakeAtoms
->
getSize
());
}
if
(
ccmaAtoms
!=
NULL
)
{
void
*
directionsArgs
[]
=
{
&
ccmaAtoms
->
getDevicePointer
(),
&
ccmaDistance
->
getDevicePointer
(),
&
context
.
getPosq
().
getDevicePointer
(),
&
posCorrection
};
void
*
directionsArgs
[]
=
{
&
ccmaAtoms
->
getDevicePointer
(),
&
ccmaDistance
->
getDevicePointer
(),
&
context
.
getPosq
().
getDevicePointer
(),
&
posCorrection
,
&
ccmaConverged
->
getDevicePointer
()
};
context
.
executeKernel
(
ccmaDirectionsKernel
,
directionsArgs
,
ccmaAtoms
->
getSize
());
int
i
;
void
*
forceArgs
[]
=
{
&
ccmaAtoms
->
getDevicePointer
(),
&
ccmaDistance
->
getDevicePointer
(),
constrainVelocities
?
&
context
.
getVelm
().
getDevicePointer
()
:
&
posDelta
->
getDevicePointer
(),
&
ccmaReducedMass
->
getDevicePointer
(),
&
ccmaDelta1
->
getDevicePointer
(),
&
ccmaConvergedDevice
Memory
,
&
ccmaReducedMass
->
getDevicePointer
(),
&
ccmaDelta1
->
getDevicePointer
(),
&
ccmaConverged
->
get
Device
Pointer
()
,
tolPointer
,
&
i
};
void
*
multiplyArgs
[]
=
{
&
ccmaDelta1
->
getDevicePointer
(),
&
ccmaDelta2
->
getDevicePointer
(),
&
ccmaConstraintMatrixColumn
->
getDevicePointer
(),
&
ccmaConstraintMatrixValue
->
getDevicePointer
(),
&
ccmaConvergedDevice
Memory
,
&
i
};
&
ccmaConstraintMatrixColumn
->
getDevicePointer
(),
&
ccmaConstraintMatrixValue
->
getDevicePointer
(),
&
ccmaConverged
->
get
Device
Pointer
()
,
&
i
};
void
*
updateArgs
[]
=
{
&
ccmaNumAtomConstraints
->
getDevicePointer
(),
&
ccmaAtomConstraints
->
getDevicePointer
(),
&
ccmaDistance
->
getDevicePointer
(),
constrainVelocities
?
&
context
.
getVelm
().
getDevicePointer
()
:
&
posDelta
->
getDevicePointer
(),
&
context
.
getVelm
().
getDevicePointer
(),
&
ccmaDelta1
->
getDevicePointer
(),
&
ccmaDelta2
->
getDevicePointer
(),
&
ccmaConvergedDevice
Memory
,
&
i
};
&
ccmaConverged
->
get
Device
Pointer
()
,
&
i
};
const
int
checkInterval
=
4
;
int
*
converged
=
(
int
*
)
context
.
getPinnedBuffer
();
for
(
i
=
0
;
i
<
150
;
i
++
)
{
if
(
i
==
0
)
{
ccmaConvergedMemory
[
0
]
=
1
;
ccmaConvergedMemory
[
1
]
=
0
;
}
context
.
executeKernel
(
ccmaForceKernel
,
forceArgs
,
ccmaAtoms
->
getSize
());
if
((
i
+
1
)
%
checkInterval
==
0
)
if
((
i
+
1
)
%
checkInterval
==
0
)
{
ccmaConverged
->
download
(
converged
,
false
);
CHECK_RESULT2
(
cuEventRecord
(
ccmaEvent
,
0
),
"Error recording event for CCMA"
);
}
context
.
executeKernel
(
ccmaMultiplyKernel
,
multiplyArgs
,
ccmaAtoms
->
getSize
());
context
.
executeKernel
(
ccmaUpdateKernel
,
updateArgs
,
context
.
getNumAtoms
());
if
((
i
+
1
)
%
checkInterval
==
0
)
{
CHECK_RESULT2
(
cuEventSynchronize
(
ccmaEvent
),
"Error synchronizing on event for CCMA"
);
if
(
c
cmaC
onverged
Memory
[
i
%
2
])
if
(
converged
[
i
%
2
])
break
;
}
}
...
...
platforms/cuda/src/CudaIntegrationUtilities.h
View file @
93c467b2
...
...
@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-201
2
Stanford University and the Authors. *
* Portions copyright (c) 2009-201
3
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -140,8 +140,7 @@ private:
CudaArray
*
ccmaConstraintMatrixValue
;
CudaArray
*
ccmaDelta1
;
CudaArray
*
ccmaDelta2
;
int
*
ccmaConvergedMemory
;
CUdeviceptr
ccmaConvergedDeviceMemory
;
CudaArray
*
ccmaConverged
;
CUevent
ccmaEvent
;
CudaArray
*
vsite2AvgAtoms
;
CudaArray
*
vsite2AvgWeights
;
...
...
platforms/cuda/src/CudaKernels.cpp
View file @
93c467b2
...
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-201
2
Stanford University and the Authors. *
* Portions copyright (c) 2008-201
3
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -1351,10 +1351,6 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
delete
pmeBsplineModuliY
;
if
(
pmeBsplineModuliZ
!=
NULL
)
delete
pmeBsplineModuliZ
;
if
(
pmeBsplineTheta
!=
NULL
)
delete
pmeBsplineTheta
;
if
(
pmeBsplineDTheta
!=
NULL
)
delete
pmeBsplineDTheta
;
if
(
pmeAtomRange
!=
NULL
)
delete
pmeAtomRange
;
if
(
pmeAtomGridIndex
!=
NULL
)
...
...
@@ -1507,13 +1503,13 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
if
(
cu
.
getUseDoublePrecision
())
pmeDefines
[
"USE_DOUBLE_PRECISION"
]
=
"1"
;
CUmodule
module
=
cu
.
createModule
(
CudaKernelSources
::
vectorOps
+
CudaKernelSources
::
pme
,
pmeDefines
);
pmeUpdateBsplinesKernel
=
cu
.
getKernel
(
module
,
"updateBsplines"
);
pmeAtomRangeKernel
=
cu
.
getKernel
(
module
,
"findAtomRangeForGrid"
);
pmeGridIndexKernel
=
cu
.
getKernel
(
module
,
"findAtomGridIndex"
);
pmeSpreadChargeKernel
=
cu
.
getKernel
(
module
,
"gridSpreadCharge"
);
pmeConvolutionKernel
=
cu
.
getKernel
(
module
,
"reciprocalConvolution"
);
pmeInterpolateForceKernel
=
cu
.
getKernel
(
module
,
"gridInterpolateForce"
);
pmeEvalEnergyKernel
=
cu
.
getKernel
(
module
,
"gridEvaluateEnergy"
);
pmeFinishSpreadChargeKernel
=
cu
.
getKernel
(
module
,
"finishSpreadCharge"
);
cuFuncSetCacheConfig
(
pmeSpreadChargeKernel
,
CU_FUNC_CACHE_PREFER_L1
);
cuFuncSetCacheConfig
(
pmeInterpolateForceKernel
,
CU_FUNC_CACHE_PREFER_L1
);
// Create required data structures.
...
...
@@ -1528,7 +1524,6 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
pmeBsplineModuliX
=
new
CudaArray
(
cu
,
gridSizeX
,
elementSize
,
"pmeBsplineModuliX"
);
pmeBsplineModuliY
=
new
CudaArray
(
cu
,
gridSizeY
,
elementSize
,
"pmeBsplineModuliY"
);
pmeBsplineModuliZ
=
new
CudaArray
(
cu
,
gridSizeZ
,
elementSize
,
"pmeBsplineModuliZ"
);
pmeBsplineTheta
=
new
CudaArray
(
cu
,
PmeOrder
*
numParticles
,
4
*
elementSize
,
"pmeBsplineTheta"
);
pmeAtomRange
=
CudaArray
::
create
<
int
>
(
cu
,
gridSizeX
*
gridSizeY
*
gridSizeZ
+
1
,
"pmeAtomRange"
);
pmeAtomGridIndex
=
CudaArray
::
create
<
int2
>
(
cu
,
numParticles
,
"pmeAtomGridIndex"
);
sort
=
new
CudaSort
(
cu
,
new
SortTrait
(),
cu
.
getNumAtoms
());
...
...
@@ -1659,20 +1654,14 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
cu
.
executeKernel
(
ewaldForcesKernel
,
forcesArgs
,
cu
.
getNumAtoms
());
}
if
(
directPmeGrid
!=
NULL
&&
cu
.
getContextIndex
()
==
0
&&
includeReciprocal
)
{
void
*
bsplinesArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
pmeBsplineTheta
->
getDevicePointer
(),
&
pmeAtomGridIndex
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()};
int
bsplinesSharedSize
=
cu
.
ThreadBlockSize
*
PmeOrder
*
(
cu
.
getUseDoublePrecision
()
?
sizeof
(
double4
)
:
sizeof
(
float4
));
cu
.
executeKernel
(
pmeUpdateBsplinesKernel
,
bsplinesArgs
,
cu
.
getNumAtoms
(),
cu
.
ThreadBlockSize
,
bsplinesSharedSize
);
void
*
gridIndexArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
pmeAtomGridIndex
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()};
cu
.
executeKernel
(
pmeGridIndexKernel
,
gridIndexArgs
,
cu
.
getNumAtoms
());
sort
->
sort
(
*
pmeAtomGridIndex
);
void
*
rangeArgs
[]
=
{
&
pmeAtomGridIndex
->
getDevicePointer
(),
&
pmeAtomRange
->
getDevicePointer
(),
&
cu
.
getPosq
().
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()};
cu
.
executeKernel
(
pmeAtomRangeKernel
,
rangeArgs
,
cu
.
getNumAtoms
());
void
*
spreadArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
directPmeGrid
->
getDevicePointer
(),
&
pmeBsplineTheta
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()};
cu
.
executeKernel
(
pmeSpreadChargeKernel
,
spreadArgs
,
cu
.
getNumAtoms
(),
PmeOrder
*
PmeOrder
*
PmeOrder
);
void
*
finishSpreadArgs
[]
=
{
&
directPmeGrid
->
getDevicePointer
()};
void
*
spreadArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
directPmeGrid
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
cu
.
getInvPeriodicBoxSizePointer
(),
&
pmeAtomGridIndex
->
getDevicePointer
()};
cu
.
executeKernel
(
pmeSpreadChargeKernel
,
spreadArgs
,
cu
.
getNumAtoms
(),
128
);
if
(
cu
.
getUseDoublePrecision
()
||
cu
.
getComputeCapability
()
<
2.0
)
{
void
*
finishSpreadArgs
[]
=
{
&
directPmeGrid
->
getDevicePointer
()};
...
...
@@ -1699,8 +1688,8 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
void
*
interpolateArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
cu
.
getForce
().
getDevicePointer
(),
&
directPmeGrid
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()};
cu
.
executeKernel
(
pmeInterpolateForceKernel
,
interpolateArgs
,
cu
.
getNumAtoms
());
cu
.
getPeriodicBoxSizePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()
,
&
pmeAtomGridIndex
->
getDevicePointer
()
};
cu
.
executeKernel
(
pmeInterpolateForceKernel
,
interpolateArgs
,
cu
.
getNumAtoms
()
,
128
);
}
double
energy
=
(
includeReciprocal
?
ewaldSelfEnergy
:
0.0
);
...
...
@@ -2071,6 +2060,14 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
defines
[
"PADDED_NUM_ATOMS"
]
=
cu
.
intToString
(
cu
.
getPaddedNumAtoms
());
defines
[
"NUM_BLOCKS"
]
=
cu
.
intToString
(
cu
.
getNumAtomBlocks
());
defines
[
"FORCE_WORK_GROUP_SIZE"
]
=
cu
.
intToString
(
nb
.
getForceThreadBlockSize
());
defines
[
"TILE_SIZE"
]
=
cu
.
intToString
(
CudaContext
::
TileSize
);
int
numExclusionTiles
=
nb
.
getExclusionTiles
().
getSize
();
defines
[
"NUM_TILES_WITH_EXCLUSIONS"
]
=
cu
.
intToString
(
numExclusionTiles
);
int
numContexts
=
cu
.
getPlatformData
().
contexts
.
size
();
int
startExclusionIndex
=
cu
.
getContextIndex
()
*
numExclusionTiles
/
numContexts
;
int
endExclusionIndex
=
(
cu
.
getContextIndex
()
+
1
)
*
numExclusionTiles
/
numContexts
;
defines
[
"FIRST_EXCLUSION_TILE"
]
=
cu
.
intToString
(
startExclusionIndex
);
defines
[
"LAST_EXCLUSION_TILE"
]
=
cu
.
intToString
(
endExclusionIndex
);
map
<
string
,
string
>
replacements
;
CUmodule
module
=
cu
.
createModule
(
CudaKernelSources
::
vectorOps
+
cu
.
replaceStrings
(
CudaKernelSources
::
gbsaObc1
,
replacements
),
defines
);
computeBornSumKernel
=
cu
.
getKernel
(
module
,
"computeBornSum"
);
...
...
@@ -2083,12 +2080,12 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
computeSumArgs
.
push_back
(
cu
.
getPeriodicBoxSizePointer
());
computeSumArgs
.
push_back
(
cu
.
getInvPeriodicBoxSizePointer
());
computeSumArgs
.
push_back
(
&
maxTiles
);
computeSumArgs
.
push_back
(
&
nb
.
getInteractionFlags
().
getDevicePointer
());
computeSumArgs
.
push_back
(
&
nb
.
getBlockCenters
().
getDevicePointer
());
computeSumArgs
.
push_back
(
&
nb
.
getInteractingAtoms
().
getDevicePointer
());
}
else
computeSumArgs
.
push_back
(
&
maxTiles
);
computeSumArgs
.
push_back
(
&
nb
.
getExclusionIndices
().
getDevicePointer
());
computeSumArgs
.
push_back
(
&
nb
.
getExclusionRowIndices
().
getDevicePointer
());
computeSumArgs
.
push_back
(
&
nb
.
getExclusionTiles
().
getDevicePointer
());
force1Kernel
=
cu
.
getKernel
(
module
,
"computeGBSAForce1"
);
force1Args
.
push_back
(
&
cu
.
getForce
().
getDevicePointer
());
force1Args
.
push_back
(
&
bornForce
->
getDevicePointer
());
...
...
@@ -2101,12 +2098,12 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
force1Args
.
push_back
(
cu
.
getPeriodicBoxSizePointer
());
force1Args
.
push_back
(
cu
.
getInvPeriodicBoxSizePointer
());
force1Args
.
push_back
(
&
maxTiles
);
force1Args
.
push_back
(
&
nb
.
getInteractionFlags
().
getDevicePointer
());
force1Args
.
push_back
(
&
nb
.
getBlockCenters
().
getDevicePointer
());
force1Args
.
push_back
(
&
nb
.
getInteractingAtoms
().
getDevicePointer
());
}
else
force1Args
.
push_back
(
&
maxTiles
);
force1Args
.
push_back
(
&
nb
.
getExclusionIndices
().
getDevicePointer
());
force1Args
.
push_back
(
&
nb
.
getExclusionRowIndices
().
getDevicePointer
());
force1Args
.
push_back
(
&
nb
.
getExclusionTiles
().
getDevicePointer
());
reduceBornSumKernel
=
cu
.
getKernel
(
module
,
"reduceBornSum"
);
reduceBornForceKernel
=
cu
.
getKernel
(
module
,
"reduceBornForce"
);
}
...
...
@@ -2115,8 +2112,8 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
maxTiles
=
nb
.
getInteractingTiles
().
getSize
();
computeSumArgs
[
3
]
=
&
nb
.
getInteractingTiles
().
getDevicePointer
();
force1Args
[
5
]
=
&
nb
.
getInteractingTiles
().
getDevicePointer
();
computeSumArgs
[
8
]
=
&
nb
.
getInteracti
onFlag
s
().
getDevicePointer
();
force1Args
[
1
0
]
=
&
nb
.
getInteracti
onFlag
s
().
getDevicePointer
();
computeSumArgs
[
9
]
=
&
nb
.
getInteracti
ngAtom
s
().
getDevicePointer
();
force1Args
[
1
1
]
=
&
nb
.
getInteracti
ngAtom
s
().
getDevicePointer
();
}
}
cu
.
executeKernel
(
computeBornSumKernel
,
&
computeSumArgs
[
0
],
nb
.
getNumForceThreadBlocks
()
*
nb
.
getForceThreadBlockSize
(),
nb
.
getForceThreadBlockSize
());
...
...
@@ -2244,16 +2241,17 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
// Record parameters and exclusions.
int
numParticles
=
force
.
getNumParticles
();
params
=
new
CudaParameterSet
(
cu
,
force
.
getNumPerParticleParameters
(),
numParticles
,
"customGBParameters"
,
true
);
computedValues
=
new
CudaParameterSet
(
cu
,
force
.
getNumComputedValues
(),
numParticles
,
"customGBComputedValues"
,
true
,
cu
.
getUseDoublePrecision
());
int
paddedNumParticles
=
cu
.
getPaddedNumAtoms
();
int
numParams
=
force
.
getNumPerParticleParameters
();
params
=
new
CudaParameterSet
(
cu
,
force
.
getNumPerParticleParameters
(),
paddedNumParticles
,
"customGBParameters"
,
true
);
computedValues
=
new
CudaParameterSet
(
cu
,
force
.
getNumComputedValues
(),
paddedNumParticles
,
"customGBComputedValues"
,
true
,
cu
.
getUseDoublePrecision
());
if
(
force
.
getNumGlobalParameters
()
>
0
)
globals
=
CudaArray
::
create
<
float
>
(
cu
,
force
.
getNumGlobalParameters
(),
"customGBGlobals"
);
vector
<
vector
<
float
>
>
paramVector
(
n
umParticles
);
vector
<
vector
<
float
>
>
paramVector
(
paddedN
umParticles
,
vector
<
float
>
(
numParams
,
0
)
);
vector
<
vector
<
int
>
>
exclusionList
(
numParticles
);
for
(
int
i
=
0
;
i
<
numParticles
;
i
++
)
{
vector
<
double
>
parameters
;
force
.
getParticleParameters
(
i
,
parameters
);
paramVector
[
i
].
resize
(
parameters
.
size
());
for
(
int
j
=
0
;
j
<
(
int
)
parameters
.
size
();
j
++
)
paramVector
[
i
][
j
]
=
(
float
)
parameters
[
j
];
exclusionList
[
i
].
push_back
(
i
);
...
...
@@ -2406,23 +2404,22 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
replacements
[
"LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"
]
=
loadLocal2
.
str
();
replacements
[
"LOAD_ATOM1_PARAMETERS"
]
=
load1
.
str
();
replacements
[
"LOAD_ATOM2_PARAMETERS"
]
=
load2
.
str
();
map
<
string
,
string
>
defines
;
if
(
useCutoff
)
d
efines
[
"USE_CUTOFF"
]
=
"1"
;
pairValueD
efines
[
"USE_CUTOFF"
]
=
"1"
;
if
(
usePeriodic
)
d
efines
[
"USE_PERIODIC"
]
=
"1"
;
pairValueD
efines
[
"USE_PERIODIC"
]
=
"1"
;
if
(
useExclusionsForValue
)
d
efines
[
"USE_EXCLUSIONS"
]
=
"1"
;
pairValueD
efines
[
"USE_EXCLUSIONS"
]
=
"1"
;
if
(
atomParamSize
%
2
==
0
&&
!
cu
.
getUseDoublePrecision
())
d
efines
[
"NEED_PADDING"
]
=
"1"
;
d
efines
[
"WARPS_PER_GROUP"
]
=
cu
.
intToString
(
cu
.
getNonbondedUtilities
().
getForceThreadBlockSize
()
/
CudaContext
::
TileSize
);
d
efines
[
"THREAD_BLOCK_SIZE"
]
=
cu
.
intToString
(
cu
.
getNonbondedUtilities
().
getForceThreadBlockSize
());
d
efines
[
"CUTOFF_SQUARED"
]
=
cu
.
doubleToString
(
force
.
getCutoffDistance
()
*
force
.
getCutoffDistance
());
d
efines
[
"NUM_ATOMS"
]
=
cu
.
intToString
(
cu
.
getNumAtoms
());
d
efines
[
"PADDED_NUM_ATOMS"
]
=
cu
.
intToString
(
cu
.
getPaddedNumAtoms
());
d
efines
[
"NUM_BLOCKS"
]
=
cu
.
intToString
(
cu
.
getNumAtomBlocks
());
CUmodule
module
=
cu
.
createModule
(
CudaKernelSources
::
vectorOps
+
cu
.
replace
String
s
(
Cuda
KernelSources
::
customGBValueN2
,
replacements
),
defines
);
pairValue
Kernel
=
cu
.
getKernel
(
module
,
"computeN2Value"
);
pairValueD
efines
[
"NEED_PADDING"
]
=
"1"
;
pairValueD
efines
[
"WARPS_PER_GROUP"
]
=
cu
.
intToString
(
cu
.
getNonbondedUtilities
().
getForceThreadBlockSize
()
/
CudaContext
::
TileSize
);
pairValueD
efines
[
"THREAD_BLOCK_SIZE"
]
=
cu
.
intToString
(
cu
.
getNonbondedUtilities
().
getForceThreadBlockSize
());
pairValueD
efines
[
"CUTOFF_SQUARED"
]
=
cu
.
doubleToString
(
force
.
getCutoffDistance
()
*
force
.
getCutoffDistance
());
pairValueD
efines
[
"NUM_ATOMS"
]
=
cu
.
intToString
(
cu
.
getNumAtoms
());
pairValueD
efines
[
"PADDED_NUM_ATOMS"
]
=
cu
.
intToString
(
cu
.
getPaddedNumAtoms
());
pairValueD
efines
[
"NUM_BLOCKS"
]
=
cu
.
intToString
(
cu
.
getNumAtomBlocks
());
pairValueDefines
[
"TILE_SIZE"
]
=
cu
.
intTo
String
(
Cuda
Context
::
TileSize
);
pairValue
Src
=
cu
.
replaceStrings
(
CudaKernelSources
::
customGBValueN2
,
replacements
);
if
(
useExclusionsForValue
)
cu
.
getNonbondedUtilities
().
requestExclusions
(
exclusionList
);
}
...
...
@@ -2574,23 +2571,22 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
replacements
[
"RECORD_DERIVATIVE_2"
]
=
recordDeriv
.
str
();
replacements
[
"STORE_DERIVATIVES_1"
]
=
storeDerivs1
.
str
();
replacements
[
"STORE_DERIVATIVES_2"
]
=
storeDerivs2
.
str
();
map
<
string
,
string
>
defines
;
if
(
useCutoff
)
d
efines
[
"USE_CUTOFF"
]
=
"1"
;
pairEnergyD
efines
[
"USE_CUTOFF"
]
=
"1"
;
if
(
usePeriodic
)
d
efines
[
"USE_PERIODIC"
]
=
"1"
;
pairEnergyD
efines
[
"USE_PERIODIC"
]
=
"1"
;
if
(
anyExclusions
)
d
efines
[
"USE_EXCLUSIONS"
]
=
"1"
;
pairEnergyD
efines
[
"USE_EXCLUSIONS"
]
=
"1"
;
if
(
atomParamSize
%
2
==
0
&&
!
cu
.
getUseDoublePrecision
())
d
efines
[
"NEED_PADDING"
]
=
"1"
;
d
efines
[
"THREAD_BLOCK_SIZE"
]
=
cu
.
intToString
(
cu
.
getNonbondedUtilities
().
getForceThreadBlockSize
());
d
efines
[
"WARPS_PER_GROUP"
]
=
cu
.
intToString
(
cu
.
getNonbondedUtilities
().
getForceThreadBlockSize
()
/
CudaContext
::
TileSize
);
d
efines
[
"CUTOFF_SQUARED"
]
=
cu
.
doubleToString
(
force
.
getCutoffDistance
()
*
force
.
getCutoffDistance
());
d
efines
[
"NUM_ATOMS"
]
=
cu
.
intToString
(
cu
.
getNumAtoms
());
d
efines
[
"PADDED_NUM_ATOMS"
]
=
cu
.
intToString
(
cu
.
getPaddedNumAtoms
());
d
efines
[
"NUM_BLOCKS"
]
=
cu
.
intToString
(
cu
.
getNumAtomBlocks
());
CUmodule
module
=
cu
.
createModule
(
CudaKernelSources
::
vectorOps
+
cu
.
replace
String
s
(
Cuda
KernelSources
::
customGBEnergyN2
,
replacements
),
defines
);
pairEnergy
Kernel
=
cu
.
getKernel
(
module
,
"computeN2Energy"
);
pairEnergyD
efines
[
"NEED_PADDING"
]
=
"1"
;
pairEnergyD
efines
[
"THREAD_BLOCK_SIZE"
]
=
cu
.
intToString
(
cu
.
getNonbondedUtilities
().
getForceThreadBlockSize
());
pairEnergyD
efines
[
"WARPS_PER_GROUP"
]
=
cu
.
intToString
(
cu
.
getNonbondedUtilities
().
getForceThreadBlockSize
()
/
CudaContext
::
TileSize
);
pairEnergyD
efines
[
"CUTOFF_SQUARED"
]
=
cu
.
doubleToString
(
force
.
getCutoffDistance
()
*
force
.
getCutoffDistance
());
pairEnergyD
efines
[
"NUM_ATOMS"
]
=
cu
.
intToString
(
cu
.
getNumAtoms
());
pairEnergyD
efines
[
"PADDED_NUM_ATOMS"
]
=
cu
.
intToString
(
cu
.
getPaddedNumAtoms
());
pairEnergyD
efines
[
"NUM_BLOCKS"
]
=
cu
.
intToString
(
cu
.
getNumAtomBlocks
());
pairEnergyDefines
[
"TILE_SIZE"
]
=
cu
.
intTo
String
(
Cuda
Context
::
TileSize
);
pairEnergy
Src
=
cu
.
replaceStrings
(
CudaKernelSources
::
customGBEnergyN2
,
replacements
);
}
{
// Create the kernel to reduce the derivatives and calculate per-particle energy terms.
...
...
@@ -2834,14 +2830,46 @@ double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFo
CudaNonbondedUtilities
&
nb
=
cu
.
getNonbondedUtilities
();
if
(
!
hasInitializedKernels
)
{
hasInitializedKernels
=
true
;
// These two kernels can't be compiled in initialize(), because the nonbonded utilities object
// has not yet been initialized then.
{
int
numExclusionTiles
=
cu
.
getNonbondedUtilities
().
getExclusionTiles
().
getSize
();
pairValueDefines
[
"NUM_TILES_WITH_EXCLUSIONS"
]
=
cu
.
intToString
(
numExclusionTiles
);
int
numContexts
=
cu
.
getPlatformData
().
contexts
.
size
();
int
startExclusionIndex
=
cu
.
getContextIndex
()
*
numExclusionTiles
/
numContexts
;
int
endExclusionIndex
=
(
cu
.
getContextIndex
()
+
1
)
*
numExclusionTiles
/
numContexts
;
pairValueDefines
[
"FIRST_EXCLUSION_TILE"
]
=
cu
.
intToString
(
startExclusionIndex
);
pairValueDefines
[
"LAST_EXCLUSION_TILE"
]
=
cu
.
intToString
(
endExclusionIndex
);
CUmodule
module
=
cu
.
createModule
(
CudaKernelSources
::
vectorOps
+
pairValueSrc
,
pairValueDefines
);
pairValueKernel
=
cu
.
getKernel
(
module
,
"computeN2Value"
);
pairValueSrc
=
""
;
pairValueDefines
.
clear
();
}
{
int
numExclusionTiles
=
cu
.
getNonbondedUtilities
().
getExclusionTiles
().
getSize
();
pairEnergyDefines
[
"NUM_TILES_WITH_EXCLUSIONS"
]
=
cu
.
intToString
(
numExclusionTiles
);
int
numContexts
=
cu
.
getPlatformData
().
contexts
.
size
();
int
startExclusionIndex
=
cu
.
getContextIndex
()
*
numExclusionTiles
/
numContexts
;
int
endExclusionIndex
=
(
cu
.
getContextIndex
()
+
1
)
*
numExclusionTiles
/
numContexts
;
pairEnergyDefines
[
"FIRST_EXCLUSION_TILE"
]
=
cu
.
intToString
(
startExclusionIndex
);
pairEnergyDefines
[
"LAST_EXCLUSION_TILE"
]
=
cu
.
intToString
(
endExclusionIndex
);
CUmodule
module
=
cu
.
createModule
(
CudaKernelSources
::
vectorOps
+
pairEnergySrc
,
pairEnergyDefines
);
pairEnergyKernel
=
cu
.
getKernel
(
module
,
"computeN2Energy"
);
pairEnergySrc
=
""
;
pairEnergyDefines
.
clear
();
}
// Set arguments for kernels.
maxTiles
=
(
nb
.
getUseCutoff
()
?
nb
.
getInteractingTiles
().
getSize
()
:
cu
.
getNumAtomBlocks
()
*
(
cu
.
getNumAtomBlocks
()
+
1
)
/
2
);
valueBuffers
=
CudaArray
::
create
<
long
long
>
(
cu
,
cu
.
getPaddedNumAtoms
(),
"customGBValueBuffers"
);
cu
.
addAutoclearBuffer
(
*
valueBuffers
);
cu
.
clearBuffer
(
valueBuffers
->
getDevicePointer
(),
sizeof
(
long
long
)
*
valueBuffers
->
getSize
());
pairValueArgs
.
push_back
(
&
cu
.
getPosq
().
getDevicePointer
());
pairValueArgs
.
push_back
(
&
cu
.
getNonbondedUtilities
().
getExclusions
().
getDevicePointer
());
pairValueArgs
.
push_back
(
&
cu
.
getNonbondedUtilities
().
getExclusionIndices
().
getDevicePointer
());
pairValueArgs
.
push_back
(
&
cu
.
getNonbondedUtilities
().
getExclusionRowIndices
().
getDevicePointer
());
pairValueArgs
.
push_back
(
&
cu
.
getNonbondedUtilities
().
getExclusionTiles
().
getDevicePointer
());
pairValueArgs
.
push_back
(
&
valueBuffers
->
getDevicePointer
());
if
(
nb
.
getUseCutoff
())
{
pairValueArgs
.
push_back
(
&
nb
.
getInteractingTiles
().
getDevicePointer
());
...
...
@@ -2849,7 +2877,8 @@ double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFo
pairValueArgs
.
push_back
(
cu
.
getPeriodicBoxSizePointer
());
pairValueArgs
.
push_back
(
cu
.
getInvPeriodicBoxSizePointer
());
pairValueArgs
.
push_back
(
&
maxTiles
);
pairValueArgs
.
push_back
(
&
nb
.
getInteractionFlags
().
getDevicePointer
());
pairValueArgs
.
push_back
(
&
nb
.
getBlockCenters
().
getDevicePointer
());
pairValueArgs
.
push_back
(
&
nb
.
getInteractingAtoms
().
getDevicePointer
());
}
else
pairValueArgs
.
push_back
(
&
maxTiles
);
...
...
@@ -2881,15 +2910,15 @@ double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFo
pairEnergyArgs
.
push_back
(
&
cu
.
getEnergyBuffer
().
getDevicePointer
());
pairEnergyArgs
.
push_back
(
&
cu
.
getPosq
().
getDevicePointer
());
pairEnergyArgs
.
push_back
(
&
cu
.
getNonbondedUtilities
().
getExclusions
().
getDevicePointer
());
pairEnergyArgs
.
push_back
(
&
cu
.
getNonbondedUtilities
().
getExclusionIndices
().
getDevicePointer
());
pairEnergyArgs
.
push_back
(
&
cu
.
getNonbondedUtilities
().
getExclusionRowIndices
().
getDevicePointer
());
pairEnergyArgs
.
push_back
(
&
cu
.
getNonbondedUtilities
().
getExclusionTiles
().
getDevicePointer
());
if
(
nb
.
getUseCutoff
())
{
pairEnergyArgs
.
push_back
(
&
nb
.
getInteractingTiles
().
getDevicePointer
());
pairEnergyArgs
.
push_back
(
&
nb
.
getInteractionCount
().
getDevicePointer
());
pairEnergyArgs
.
push_back
(
cu
.
getPeriodicBoxSizePointer
());
pairEnergyArgs
.
push_back
(
cu
.
getInvPeriodicBoxSizePointer
());
pairEnergyArgs
.
push_back
(
&
maxTiles
);
pairEnergyArgs
.
push_back
(
&
nb
.
getInteractionFlags
().
getDevicePointer
());
pairEnergyArgs
.
push_back
(
&
nb
.
getBlockCenters
().
getDevicePointer
());
pairEnergyArgs
.
push_back
(
&
nb
.
getInteractingAtoms
().
getDevicePointer
());
}
else
pairEnergyArgs
.
push_back
(
&
maxTiles
);
...
...
@@ -2953,10 +2982,10 @@ double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFo
if
(
nb
.
getUseCutoff
())
{
if
(
maxTiles
<
nb
.
getInteractingTiles
().
getSize
())
{
maxTiles
=
nb
.
getInteractingTiles
().
getSize
();
pairValueArgs
[
5
]
=
&
nb
.
getInteractingTiles
().
getDevicePointer
();
pairEnergyArgs
[
6
]
=
&
nb
.
getInteractingTiles
().
getDevicePointer
();
pairValueArgs
[
10
]
=
&
nb
.
getInteracti
onFlag
s
().
getDevicePointer
();
pairEnergyArgs
[
11
]
=
&
nb
.
getInteracti
onFlag
s
().
getDevicePointer
();
pairValueArgs
[
4
]
=
&
nb
.
getInteractingTiles
().
getDevicePointer
();
pairEnergyArgs
[
5
]
=
&
nb
.
getInteractingTiles
().
getDevicePointer
();
pairValueArgs
[
10
]
=
&
nb
.
getInteracti
ngAtom
s
().
getDevicePointer
();
pairEnergyArgs
[
11
]
=
&
nb
.
getInteracti
ngAtom
s
().
getDevicePointer
();
}
}
cu
.
executeKernel
(
pairValueKernel
,
&
pairValueArgs
[
0
],
nb
.
getNumForceThreadBlocks
()
*
nb
.
getForceThreadBlockSize
(),
nb
.
getForceThreadBlockSize
());
...
...
@@ -2976,11 +3005,10 @@ void CudaCalcCustomGBForceKernel::copyParametersToContext(ContextImpl& context,
// Record the per-particle parameters.
vector
<
vector
<
float
>
>
paramVector
(
numParticles
);
vector
<
vector
<
float
>
>
paramVector
(
cu
.
getPaddedNumAtoms
(),
vector
<
float
>
(
force
.
getNumPerParticleParameters
(),
0
)
);
vector
<
double
>
parameters
;
for
(
int
i
=
0
;
i
<
numParticles
;
i
++
)
{
force
.
getParticleParameters
(
i
,
parameters
);
paramVector
[
i
].
resize
(
parameters
.
size
());
for
(
int
j
=
0
;
j
<
(
int
)
parameters
.
size
();
j
++
)
paramVector
[
i
][
j
]
=
(
float
)
parameters
[
j
];
}
...
...
platforms/cuda/src/CudaKernels.h
View file @
93c467b2
...
...
@@ -557,8 +557,7 @@ class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
public:
CudaCalcNonbondedForceKernel
(
std
::
string
name
,
const
Platform
&
platform
,
CudaContext
&
cu
,
System
&
system
)
:
CalcNonbondedForceKernel
(
name
,
platform
),
cu
(
cu
),
hasInitializedFFT
(
false
),
sigmaEpsilon
(
NULL
),
exceptionParams
(
NULL
),
cosSinSums
(
NULL
),
directPmeGrid
(
NULL
),
reciprocalPmeGrid
(
NULL
),
pmeBsplineModuliX
(
NULL
),
pmeBsplineModuliY
(
NULL
),
pmeBsplineModuliZ
(
NULL
),
pmeBsplineTheta
(
NULL
),
pmeBsplineDTheta
(
NULL
),
pmeAtomRange
(
NULL
),
pmeAtomGridIndex
(
NULL
),
sort
(
NULL
)
{
pmeBsplineModuliX
(
NULL
),
pmeBsplineModuliY
(
NULL
),
pmeBsplineModuliZ
(
NULL
),
pmeAtomRange
(
NULL
),
pmeAtomGridIndex
(
NULL
),
sort
(
NULL
)
{
}
~
CudaCalcNonbondedForceKernel
();
/**
...
...
@@ -607,8 +606,6 @@ private:
CudaArray
*
pmeBsplineModuliX
;
CudaArray
*
pmeBsplineModuliY
;
CudaArray
*
pmeBsplineModuliZ
;
CudaArray
*
pmeBsplineTheta
;
CudaArray
*
pmeBsplineDTheta
;
CudaArray
*
pmeAtomRange
;
CudaArray
*
pmeAtomGridIndex
;
CudaSort
*
sort
;
...
...
@@ -617,9 +614,6 @@ private:
CUfunction
ewaldSumsKernel
;
CUfunction
ewaldForcesKernel
;
CUfunction
pmeGridIndexKernel
;
CUfunction
pmeAtomRangeKernel
;
CUfunction
pmeZIndexKernel
;
CUfunction
pmeUpdateBsplinesKernel
;
CUfunction
pmeSpreadChargeKernel
;
CUfunction
pmeFinishSpreadChargeKernel
;
CUfunction
pmeEvalEnergyKernel
;
...
...
@@ -776,6 +770,8 @@ private:
System
&
system
;
CUfunction
pairValueKernel
,
perParticleValueKernel
,
pairEnergyKernel
,
perParticleEnergyKernel
,
gradientChainRuleKernel
;
std
::
vector
<
void
*>
pairValueArgs
,
perParticleValueArgs
,
pairEnergyArgs
,
perParticleEnergyArgs
,
gradientChainRuleArgs
;
std
::
string
pairValueSrc
,
pairEnergySrc
;
std
::
map
<
std
::
string
,
std
::
string
>
pairValueDefines
,
pairEnergyDefines
;
};
/**
...
...
platforms/cuda/src/CudaNonbondedUtilities.cpp
View file @
93c467b2
...
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-201
2
Stanford University and the Authors. *
* Portions copyright (c) 2009-201
3
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -29,6 +29,8 @@
#include "CudaArray.h"
#include "CudaKernelSources.h"
#include "CudaExpressionUtilities.h"
#include "CudaSort.h"
#include <algorithm>
#include <map>
#include <set>
#include <utility>
...
...
@@ -43,15 +45,33 @@ using namespace std;
throw OpenMMException(m.str());\
}
CudaNonbondedUtilities
::
CudaNonbondedUtilities
(
CudaContext
&
context
)
:
context
(
context
),
cutoff
(
-
1.0
),
useCutoff
(
false
),
anyExclusions
(
false
),
exclusionIndices
(
NULL
),
exclusionRowIndices
(
NULL
),
exclusions
(
NULL
),
interactingTiles
(
NULL
),
interactionFlags
(
NULL
),
interactionCount
(
NULL
),
blockCenter
(
NULL
),
blockBoundingBox
(
NULL
),
nonbondedForceGroup
(
0
)
{
class
CudaNonbondedUtilities
::
BlockSortTrait
:
public
CudaSort
::
SortTrait
{
public:
BlockSortTrait
(
bool
useDouble
)
:
useDouble
(
useDouble
)
{
}
int
getDataSize
()
const
{
return
useDouble
?
sizeof
(
double2
)
:
sizeof
(
float2
);}
int
getKeySize
()
const
{
return
useDouble
?
sizeof
(
double
)
:
sizeof
(
float
);}
const
char
*
getDataType
()
const
{
return
"real2"
;}
const
char
*
getKeyType
()
const
{
return
"real"
;}
const
char
*
getMinKey
()
const
{
return
"-3.40282e+38f"
;}
const
char
*
getMaxKey
()
const
{
return
"3.40282e+38f"
;}
const
char
*
getMaxValue
()
const
{
return
"make_real2(3.40282e+38f, 3.40282e+38f)"
;}
const
char
*
getSortKey
()
const
{
return
"value.x"
;}
private:
bool
useDouble
;
};
CudaNonbondedUtilities
::
CudaNonbondedUtilities
(
CudaContext
&
context
)
:
context
(
context
),
cutoff
(
-
1.0
),
useCutoff
(
false
),
anyExclusions
(
false
),
usePadding
(
true
),
exclusionIndices
(
NULL
),
exclusionRowIndices
(
NULL
),
exclusionTiles
(
NULL
),
exclusions
(
NULL
),
interactingTiles
(
NULL
),
interactingAtoms
(
NULL
),
interactionCount
(
NULL
),
blockCenter
(
NULL
),
blockBoundingBox
(
NULL
),
sortedBlocks
(
NULL
),
sortedBlockCenter
(
NULL
),
sortedBlockBoundingBox
(
NULL
),
oldPositions
(
NULL
),
rebuildNeighborList
(
NULL
),
blockSorter
(
NULL
),
nonbondedForceGroup
(
0
)
{
// Decide how many thread blocks to use.
string
errorMessage
=
"Error initializing nonbonded utilities"
;
int
multiprocessors
;
CHECK_RESULT
(
cuDeviceGetAttribute
(
&
multiprocessors
,
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
,
context
.
getDevice
()));
numForceThreadBlocks
=
3
*
multiprocessors
;
numForceThreadBlocks
=
4
*
multiprocessors
;
forceThreadBlockSize
=
(
context
.
getComputeCapability
()
<
2.0
?
128
:
256
);
}
...
...
@@ -60,18 +80,32 @@ CudaNonbondedUtilities::~CudaNonbondedUtilities() {
delete
exclusionIndices
;
if
(
exclusionRowIndices
!=
NULL
)
delete
exclusionRowIndices
;
if
(
exclusionTiles
!=
NULL
)
delete
exclusionTiles
;
if
(
exclusions
!=
NULL
)
delete
exclusions
;
if
(
interactingTiles
!=
NULL
)
delete
interactingTiles
;
if
(
interacti
onFlag
s
!=
NULL
)
delete
interacti
onFlag
s
;
if
(
interacti
ngAtom
s
!=
NULL
)
delete
interacti
ngAtom
s
;
if
(
interactionCount
!=
NULL
)
delete
interactionCount
;
if
(
blockCenter
!=
NULL
)
delete
blockCenter
;
if
(
blockBoundingBox
!=
NULL
)
delete
blockBoundingBox
;
if
(
sortedBlocks
!=
NULL
)
delete
sortedBlocks
;
if
(
sortedBlockCenter
!=
NULL
)
delete
sortedBlockCenter
;
if
(
sortedBlockBoundingBox
!=
NULL
)
delete
sortedBlockBoundingBox
;
if
(
oldPositions
!=
NULL
)
delete
oldPositions
;
if
(
rebuildNeighborList
!=
NULL
)
delete
rebuildNeighborList
;
if
(
blockSorter
!=
NULL
)
delete
blockSorter
;
}
void
CudaNonbondedUtilities
::
addInteraction
(
bool
usesCutoff
,
bool
usesPeriodic
,
bool
usesExclusions
,
double
cutoffDistance
,
const
vector
<
vector
<
int
>
>&
exclusionList
,
const
string
&
kernel
,
int
forceGroup
)
{
...
...
@@ -124,6 +158,10 @@ void CudaNonbondedUtilities::requestExclusions(const vector<vector<int> >& exclu
}
}
static
bool
compareUshort2
(
ushort2
a
,
ushort2
b
)
{
return
((
a
.
y
<
b
.
y
)
||
(
a
.
y
==
b
.
y
&&
a
.
x
<
b
.
x
));
}
void
CudaNonbondedUtilities
::
initialize
(
const
System
&
system
)
{
string
errorMessage
=
"Error initializing nonbonded utilities"
;
if
(
atomExclusions
.
size
()
==
0
)
{
...
...
@@ -138,13 +176,10 @@ void CudaNonbondedUtilities::initialize(const System& system) {
numAtoms
=
context
.
getNumAtoms
();
int
numAtomBlocks
=
context
.
getNumAtomBlocks
();
int
totalTiles
=
numAtomBlocks
*
(
numAtomBlocks
+
1
)
/
2
;
int
numContexts
=
context
.
getPlatformData
().
contexts
.
size
();
startTileIndex
=
context
.
getContextIndex
()
*
totalTiles
/
numContexts
;
int
endTileIndex
=
(
context
.
getContextIndex
()
+
1
)
*
totalTiles
/
numContexts
;
numTiles
=
endTileIndex
-
startTileIndex
;
setAtomBlockRange
(
context
.
getContextIndex
()
/
(
double
)
numContexts
,
(
context
.
getContextIndex
()
+
1
)
/
(
double
)
numContexts
);
// Build a list of
indices for the tiles with
exclusions.
// Build a list of
tiles that contain
exclusions.
set
<
pair
<
int
,
int
>
>
tilesWithExclusions
;
for
(
int
atom1
=
0
;
atom1
<
(
int
)
atomExclusions
.
size
();
++
atom1
)
{
...
...
@@ -155,19 +190,29 @@ void CudaNonbondedUtilities::initialize(const System& system) {
tilesWithExclusions
.
insert
(
make_pair
(
max
(
x
,
y
),
min
(
x
,
y
)));
}
}
if
(
context
.
getPaddedNumAtoms
()
>
context
.
getNumAtoms
())
{
for
(
int
i
=
0
;
i
<
numAtomBlocks
;
++
i
)
tilesWithExclusions
.
insert
(
make_pair
(
numAtomBlocks
-
1
,
i
));
vector
<
ushort2
>
exclusionTilesVec
;
for
(
set
<
pair
<
int
,
int
>
>::
const_iterator
iter
=
tilesWithExclusions
.
begin
();
iter
!=
tilesWithExclusions
.
end
();
++
iter
)
exclusionTilesVec
.
push_back
(
make_ushort2
((
unsigned
short
)
iter
->
first
,
(
unsigned
short
)
iter
->
second
));
sort
(
exclusionTilesVec
.
begin
(),
exclusionTilesVec
.
end
(),
compareUshort2
);
exclusionTiles
=
CudaArray
::
create
<
ushort2
>
(
context
,
exclusionTilesVec
.
size
(),
"exclusionTiles"
);
exclusionTiles
->
upload
(
exclusionTilesVec
);
map
<
pair
<
int
,
int
>
,
int
>
exclusionTileMap
;
for
(
int
i
=
0
;
i
<
(
int
)
exclusionTilesVec
.
size
();
i
++
)
{
ushort2
tile
=
exclusionTilesVec
[
i
];
exclusionTileMap
[
make_pair
(
tile
.
x
,
tile
.
y
)]
=
i
;
}
vector
<
vector
<
int
>
>
exclusionBlocksForBlock
(
numAtomBlocks
);
for
(
set
<
pair
<
int
,
int
>
>::
const_iterator
iter
=
tilesWithExclusions
.
begin
();
iter
!=
tilesWithExclusions
.
end
();
++
iter
)
{
exclusionBlocksForBlock
[
iter
->
first
].
push_back
(
iter
->
second
);
if
(
iter
->
first
!=
iter
->
second
)
exclusionBlocksForBlock
[
iter
->
second
].
push_back
(
iter
->
first
);
}
vector
<
unsigned
int
>
exclusionRowIndicesVec
(
numAtomBlocks
+
1
,
0
);
vector
<
unsigned
int
>
exclusionIndicesVec
;
int
currentRow
=
0
;
for
(
set
<
pair
<
int
,
int
>
>::
const_iterator
iter
=
tilesWithExclusions
.
begin
();
iter
!=
tilesWithExclusions
.
end
();
++
iter
)
{
while
(
iter
->
first
!=
currentRow
)
exclusionRowIndicesVec
[
++
currentRow
]
=
exclusionIndicesVec
.
size
();
exclusionIndicesVec
.
push_back
(
iter
->
second
);
for
(
int
i
=
0
;
i
<
numAtomBlocks
;
i
++
)
{
exclusionIndicesVec
.
insert
(
exclusionIndicesVec
.
end
(),
exclusionBlocksForBlock
[
i
].
begin
(),
exclusionBlocksForBlock
[
i
].
end
());
exclusionRowIndicesVec
[
i
+
1
]
=
exclusionIndicesVec
.
size
();
}
exclusionRowIndicesVec
[
++
currentRow
]
=
exclusionIndicesVec
.
size
();
exclusionIndices
=
CudaArray
::
create
<
unsigned
int
>
(
context
,
exclusionIndicesVec
.
size
(),
"exclusionIndices"
);
exclusionRowIndices
=
CudaArray
::
create
<
unsigned
int
>
(
context
,
exclusionRowIndicesVec
.
size
(),
"exclusionRowIndices"
);
exclusionIndices
->
upload
(
exclusionIndicesVec
);
...
...
@@ -175,8 +220,9 @@ void CudaNonbondedUtilities::initialize(const System& system) {
// Record the exclusion data.
exclusions
=
CudaArray
::
create
<
unsigned
int
>
(
context
,
tilesWithExclusions
.
size
()
*
CudaContext
::
TileSize
,
"exclusions"
);
vector
<
unsigned
int
>
exclusionVec
(
exclusions
->
getSize
(),
0xFFFFFFFF
);
exclusions
=
CudaArray
::
create
<
tileflags
>
(
context
,
tilesWithExclusions
.
size
()
*
CudaContext
::
TileSize
,
"exclusions"
);
tileflags
allFlags
=
(
tileflags
)
-
1
;
vector
<
tileflags
>
exclusionVec
(
exclusions
->
getSize
(),
allFlags
);
for
(
int
atom1
=
0
;
atom1
<
(
int
)
atomExclusions
.
size
();
++
atom1
)
{
int
x
=
atom1
/
CudaContext
::
TileSize
;
int
offset1
=
atom1
-
x
*
CudaContext
::
TileSize
;
...
...
@@ -185,31 +231,12 @@ void CudaNonbondedUtilities::initialize(const System& system) {
int
y
=
atom2
/
CudaContext
::
TileSize
;
int
offset2
=
atom2
-
y
*
CudaContext
::
TileSize
;
if
(
x
>
y
)
{
int
index
=
findE
xclusion
Index
(
x
,
y
,
exclusionIndicesVec
,
exclusionRowIndicesVec
)
;
exclusionVec
[
index
+
offset1
]
&=
0xFFFFFFFF
-
(
1
<<
offset2
);
int
index
=
e
xclusion
TileMap
[
make_pair
(
x
,
y
)]
*
CudaContext
::
TileSize
;
exclusionVec
[
index
+
offset1
]
&=
allFlags
-
(
1
<<
offset2
);
}
else
{
int
index
=
findExclusionIndex
(
y
,
x
,
exclusionIndicesVec
,
exclusionRowIndicesVec
);
exclusionVec
[
index
+
offset2
]
&=
0xFFFFFFFF
-
(
1
<<
offset1
);
}
}
}
// Mark all interactions that involve a padding atom as being excluded.
for
(
int
atom1
=
context
.
getNumAtoms
();
atom1
<
context
.
getPaddedNumAtoms
();
++
atom1
)
{
int
x
=
atom1
/
CudaContext
::
TileSize
;
int
offset1
=
atom1
-
x
*
CudaContext
::
TileSize
;
for
(
int
atom2
=
0
;
atom2
<
context
.
getPaddedNumAtoms
();
++
atom2
)
{
int
y
=
atom2
/
CudaContext
::
TileSize
;
int
offset2
=
atom2
-
y
*
CudaContext
::
TileSize
;
if
(
x
>=
y
)
{
int
index
=
findExclusionIndex
(
x
,
y
,
exclusionIndicesVec
,
exclusionRowIndicesVec
);
exclusionVec
[
index
+
offset1
]
&=
0xFFFFFFFF
-
(
1
<<
offset2
);
}
if
(
y
>=
x
)
{
int
index
=
findExclusionIndex
(
y
,
x
,
exclusionIndicesVec
,
exclusionRowIndicesVec
);
exclusionVec
[
index
+
offset2
]
&=
0xFFFFFFFF
-
(
1
<<
offset1
);
int
index
=
exclusionTileMap
[
make_pair
(
y
,
x
)]
*
CudaContext
::
TileSize
;
exclusionVec
[
index
+
offset2
]
&=
allFlags
-
(
1
<<
offset1
);
}
}
}
...
...
@@ -219,26 +246,34 @@ void CudaNonbondedUtilities::initialize(const System& system) {
// Create data structures for the neighbor list.
if
(
useCutoff
)
{
// Select a size for the arrays that hold the neighbor list.
This esti
ma
t
e
is intentionally ver
y
//
high, because if it ever is
too small
,
we
have to fall back to the N^2 algorithm
.
// Select a size for the arrays that hold the neighbor list.
We have to
ma
k
e
a fairl
y
//
arbitrary guess, but if this turns out to be
too small we
'll increase it later
.
double4
boxSize
=
context
.
getPeriodicBoxSize
();
maxTiles
=
(
int
)
(
numTiles
*
(
cutoff
/
boxSize
.
x
+
cutoff
/
boxSize
.
y
+
cutoff
/
boxSize
.
z
));
maxTiles
=
20
*
numAtomBlocks
;
if
(
maxTiles
>
numTiles
)
maxTiles
=
numTiles
;
if
(
maxTiles
<
1
)
maxTiles
=
1
;
interactingTiles
=
CudaArray
::
create
<
ushort2
>
(
context
,
maxTiles
,
"interactingTiles"
);
interacti
onFlag
s
=
CudaArray
::
create
<
unsigned
int
>
(
context
,
maxTiles
,
"interacti
onFlag
s"
);
interacti
ngAtom
s
=
CudaArray
::
create
<
int
>
(
context
,
CudaContext
::
TileSize
*
maxTiles
,
"interacti
ngAtom
s"
);
interactionCount
=
CudaArray
::
create
<
unsigned
int
>
(
context
,
1
,
"interactionCount"
);
int
elementSize
=
(
context
.
getUseDoublePrecision
()
?
sizeof
(
double
)
:
sizeof
(
float
));
blockCenter
=
new
CudaArray
(
context
,
numAtomBlocks
,
4
*
elementSize
,
"blockCenter"
);
blockBoundingBox
=
new
CudaArray
(
context
,
numAtomBlocks
,
4
*
elementSize
,
"blockBoundingBox"
);
sortedBlocks
=
new
CudaArray
(
context
,
numAtomBlocks
,
2
*
elementSize
,
"sortedBlocks"
);
sortedBlockCenter
=
new
CudaArray
(
context
,
numAtomBlocks
+
1
,
4
*
elementSize
,
"sortedBlockCenter"
);
sortedBlockBoundingBox
=
new
CudaArray
(
context
,
numAtomBlocks
+
1
,
4
*
elementSize
,
"sortedBlockBoundingBox"
);
oldPositions
=
new
CudaArray
(
context
,
numAtoms
,
4
*
elementSize
,
"oldPositions"
);
if
(
context
.
getUseDoublePrecision
())
{
blockCenter
=
CudaArray
::
create
<
double4
>
(
context
,
numAtomBlocks
,
"blockCenter"
);
blockBoundingBox
=
CudaArray
::
create
<
double4
>
(
context
,
numAtomBlocks
,
"blockBoundingBox"
);
vector
<
double4
>
oldPositionsVec
(
numAtoms
,
make_double4
(
1e30
,
1e30
,
1e30
,
0
)
);
oldPositions
->
upload
(
oldPositionsVec
);
}
else
{
blockCenter
=
CudaArray
::
create
<
float4
>
(
context
,
numAtomBlocks
,
"blockCenter"
);
blockBoundingBox
=
CudaArray
::
create
<
float4
>
(
context
,
numAtomBlocks
,
"blockBoundingBox"
);
vector
<
float4
>
oldPositionsVec
(
numAtoms
,
make_float4
(
1e30
f
,
1e30
f
,
1e30
f
,
0
)
);
oldPositions
->
upload
(
oldPositionsVec
);
}
rebuildNeighborList
=
CudaArray
::
create
<
int
>
(
context
,
1
,
"rebuildNeighborList"
);
blockSorter
=
new
CudaSort
(
context
,
new
BlockSortTrait
(
context
.
getUseDoublePrecision
()),
numAtomBlocks
);
vector
<
unsigned
int
>
count
(
1
,
0
);
interactionCount
->
upload
(
count
);
}
...
...
@@ -248,11 +283,22 @@ void CudaNonbondedUtilities::initialize(const System& system) {
if
(
kernelSource
.
size
()
>
0
)
forceKernel
=
createInteractionKernel
(
kernelSource
,
parameters
,
arguments
,
true
,
true
);
if
(
useCutoff
)
{
double
padding
=
(
usePadding
?
0.1
*
cutoff
:
0.0
);
double
paddedCutoff
=
cutoff
+
padding
;
map
<
string
,
string
>
defines
;
defines
[
"TILE_SIZE"
]
=
context
.
intToString
(
CudaContext
::
TileSize
);
defines
[
"NUM_BLOCKS"
]
=
context
.
intToString
(
context
.
getNumAtomBlocks
());
defines
[
"CUTOFF_SQUARED"
]
=
context
.
doubleToString
(
cutoff
*
cutoff
);
defines
[
"NUM_ATOMS"
]
=
context
.
intToString
(
context
.
getNumAtoms
());
defines
[
"PADDING"
]
=
context
.
doubleToString
(
padding
);
defines
[
"PADDED_CUTOFF"
]
=
context
.
doubleToString
(
paddedCutoff
);
defines
[
"PADDED_CUTOFF_SQUARED"
]
=
context
.
doubleToString
(
paddedCutoff
*
paddedCutoff
);
defines
[
"NUM_TILES_WITH_EXCLUSIONS"
]
=
context
.
intToString
(
exclusionTiles
->
getSize
());
if
(
usePeriodic
)
defines
[
"USE_PERIODIC"
]
=
"1"
;
int
maxExclusions
=
0
;
for
(
int
i
=
0
;
i
<
(
int
)
exclusionBlocksForBlock
.
size
();
i
++
)
maxExclusions
=
(
maxExclusions
>
exclusionBlocksForBlock
[
i
].
size
()
?
maxExclusions
:
exclusionBlocksForBlock
[
i
].
size
());
defines
[
"MAX_EXCLUSIONS"
]
=
context
.
intToString
(
maxExclusions
);
CUmodule
interactingBlocksProgram
=
context
.
createModule
(
CudaKernelSources
::
vectorOps
+
CudaKernelSources
::
findInteractingBlocks
,
defines
);
findBlockBoundsKernel
=
context
.
getKernel
(
interactingBlocksProgram
,
"findBlockBounds"
);
findBlockBoundsArgs
.
push_back
(
&
numAtoms
);
...
...
@@ -261,7 +307,18 @@ void CudaNonbondedUtilities::initialize(const System& system) {
findBlockBoundsArgs
.
push_back
(
&
context
.
getPosq
().
getDevicePointer
());
findBlockBoundsArgs
.
push_back
(
&
blockCenter
->
getDevicePointer
());
findBlockBoundsArgs
.
push_back
(
&
blockBoundingBox
->
getDevicePointer
());
findBlockBoundsArgs
.
push_back
(
&
interactionCount
->
getDevicePointer
());
findBlockBoundsArgs
.
push_back
(
&
rebuildNeighborList
->
getDevicePointer
());
findBlockBoundsArgs
.
push_back
(
&
sortedBlocks
->
getDevicePointer
());
sortBoxDataKernel
=
context
.
getKernel
(
interactingBlocksProgram
,
"sortBoxData"
);
sortBoxDataArgs
.
push_back
(
&
sortedBlocks
->
getDevicePointer
());
sortBoxDataArgs
.
push_back
(
&
blockCenter
->
getDevicePointer
());
sortBoxDataArgs
.
push_back
(
&
blockBoundingBox
->
getDevicePointer
());
sortBoxDataArgs
.
push_back
(
&
sortedBlockCenter
->
getDevicePointer
());
sortBoxDataArgs
.
push_back
(
&
sortedBlockBoundingBox
->
getDevicePointer
());
sortBoxDataArgs
.
push_back
(
&
context
.
getPosq
().
getDevicePointer
());
sortBoxDataArgs
.
push_back
(
&
oldPositions
->
getDevicePointer
());
sortBoxDataArgs
.
push_back
(
&
interactionCount
->
getDevicePointer
());
sortBoxDataArgs
.
push_back
(
&
rebuildNeighborList
->
getDevicePointer
());
findInteractingBlocksKernel
=
context
.
getKernel
(
interactingBlocksProgram
,
"findBlocksWithInteractions"
);
findInteractingBlocksArgs
.
push_back
(
context
.
getPeriodicBoxSizePointer
());
findInteractingBlocksArgs
.
push_back
(
context
.
getInvPeriodicBoxSizePointer
());
...
...
@@ -269,35 +326,21 @@ void CudaNonbondedUtilities::initialize(const System& system) {
findInteractingBlocksArgs
.
push_back
(
&
blockBoundingBox
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
interactionCount
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
interactingTiles
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
interacti
onFlag
s
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
interacti
ngAtom
s
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
context
.
getPosq
().
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
maxTiles
);
findInteractingBlocksArgs
.
push_back
(
&
startTileIndex
);
findInteractingBlocksArgs
.
push_back
(
&
numTiles
);
findInteractionsWithinBlocksKernel
=
context
.
getKernel
(
interactingBlocksProgram
,
"findInteractionsWithinBlocks"
);
findInteractionsWithinBlocksArgs
.
push_back
(
context
.
getPeriodicBoxSizePointer
());
findInteractionsWithinBlocksArgs
.
push_back
(
context
.
getInvPeriodicBoxSizePointer
());
findInteractionsWithinBlocksArgs
.
push_back
(
&
context
.
getPosq
().
getDevicePointer
());
findInteractionsWithinBlocksArgs
.
push_back
(
&
interactingTiles
->
getDevicePointer
());
findInteractionsWithinBlocksArgs
.
push_back
(
&
blockCenter
->
getDevicePointer
());
findInteractionsWithinBlocksArgs
.
push_back
(
&
blockBoundingBox
->
getDevicePointer
());
findInteractionsWithinBlocksArgs
.
push_back
(
&
interactionFlags
->
getDevicePointer
());
findInteractionsWithinBlocksArgs
.
push_back
(
&
interactionCount
->
getDevicePointer
());
findInteractionsWithinBlocksArgs
.
push_back
(
&
maxTiles
);
findInteractingBlocksArgs
.
push_back
(
&
startBlockIndex
);
findInteractingBlocksArgs
.
push_back
(
&
numBlocks
);
findInteractingBlocksArgs
.
push_back
(
&
sortedBlocks
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
sortedBlockCenter
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
sortedBlockBoundingBox
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
exclusionIndices
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
exclusionRowIndices
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
oldPositions
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
rebuildNeighborList
->
getDevicePointer
());
}
}
int
CudaNonbondedUtilities
::
findExclusionIndex
(
int
x
,
int
y
,
const
vector
<
unsigned
int
>&
exclusionIndices
,
const
vector
<
unsigned
int
>&
exclusionRowIndices
)
{
if
(
x
<
y
)
throw
OpenMMException
(
"Internal error: called findExclusionIndex with x<y"
);
int
start
=
exclusionRowIndices
[
x
];
int
end
=
exclusionRowIndices
[
x
+
1
];
for
(
int
i
=
start
;
i
<
end
;
i
++
)
if
(
exclusionIndices
[
i
]
==
y
)
return
i
*
CudaContext
::
TileSize
;
throw
OpenMMException
(
"Internal error: exclusion in unexpected tile"
);
}
void
CudaNonbondedUtilities
::
prepareInteractions
()
{
if
(
!
useCutoff
)
return
;
...
...
@@ -311,13 +354,17 @@ void CudaNonbondedUtilities::prepareInteractions() {
// Compute the neighbor list.
context
.
executeKernel
(
findBlockBoundsKernel
,
&
findBlockBoundsArgs
[
0
],
context
.
getNumAtoms
());
context
.
executeKernel
(
findInteractingBlocksKernel
,
&
findInteractingBlocksArgs
[
0
],
context
.
getNumAtoms
());
context
.
executeKernel
(
findInteractionsWithinBlocksKernel
,
&
findInteractionsWithinBlocksArgs
[
0
],
context
.
getNumAtoms
(),
128
);
blockSorter
->
sort
(
*
sortedBlocks
);
context
.
executeKernel
(
sortBoxDataKernel
,
&
sortBoxDataArgs
[
0
],
context
.
getNumAtoms
());
context
.
executeKernel
(
findInteractingBlocksKernel
,
&
findInteractingBlocksArgs
[
0
],
context
.
getNumAtoms
(),
256
);
}
void
CudaNonbondedUtilities
::
computeInteractions
()
{
if
(
kernelSource
.
size
()
>
0
)
if
(
kernelSource
.
size
()
>
0
)
{
context
.
executeKernel
(
forceKernel
,
&
forceArgs
[
0
],
numForceThreadBlocks
*
forceThreadBlockSize
,
forceThreadBlockSize
);
if
(
context
.
getComputeForceCount
()
==
1
)
updateNeighborListSize
();
// This is the first time step, so check whether our initial guess was large enough.
}
}
void
CudaNonbondedUtilities
::
updateNeighborListSize
()
{
...
...
@@ -332,26 +379,42 @@ void CudaNonbondedUtilities::updateNeighborListSize() {
// this from happening in the future.
maxTiles
=
(
int
)
(
1.2
*
pinnedInteractionCount
[
0
]);
int
num
Tiles
=
context
.
getNumAtomBlocks
()
*
(
context
.
getNumAtomBlocks
()
+
1
)
/
2
;
if
(
maxTiles
>
num
Tiles
)
maxTiles
=
num
Tiles
;
int
total
Tiles
=
context
.
getNumAtomBlocks
()
*
(
context
.
getNumAtomBlocks
()
+
1
)
/
2
;
if
(
maxTiles
>
total
Tiles
)
maxTiles
=
total
Tiles
;
delete
interactingTiles
;
delete
interactingAtoms
;
interactingTiles
=
NULL
;
// Avoid an error in the destructor if the following allocation fails
interactingAtoms
=
NULL
;
interactingTiles
=
CudaArray
::
create
<
ushort2
>
(
context
,
maxTiles
,
"interactingTiles"
);
interactingAtoms
=
CudaArray
::
create
<
int
>
(
context
,
CudaContext
::
TileSize
*
maxTiles
,
"interactingAtoms"
);
if
(
forceArgs
.
size
()
>
0
)
forceArgs
[
8
]
=
&
interactingTiles
->
getDevicePointer
();
forceArgs
[
7
]
=
&
interactingTiles
->
getDevicePointer
();
findInteractingBlocksArgs
[
5
]
=
&
interactingTiles
->
getDevicePointer
();
delete
interactionFlags
;
interactionFlags
=
CudaArray
::
create
<
unsigned
int
>
(
context
,
maxTiles
,
"interactionFlags"
);
if
(
forceArgs
.
size
()
>
0
)
forceArgs
[
13
]
=
&
interactionFlags
->
getDevicePointer
();
findInteractingBlocksArgs
[
6
]
=
&
interactionFlags
->
getDevicePointer
();
findInteractionsWithinBlocksArgs
[
3
]
=
&
interactingTiles
->
getDevicePointer
();
findInteractionsWithinBlocksArgs
[
6
]
=
&
interactionFlags
->
getDevicePointer
();
forceArgs
[
13
]
=
&
interactingAtoms
->
getDevicePointer
();
findInteractingBlocksArgs
[
6
]
=
&
interactingAtoms
->
getDevicePointer
();
if
(
context
.
getUseDoublePrecision
())
{
vector
<
double4
>
oldPositionsVec
(
numAtoms
,
make_double4
(
1e30
,
1e30
,
1e30
,
0
));
oldPositions
->
upload
(
oldPositionsVec
);
}
else
{
vector
<
float4
>
oldPositionsVec
(
numAtoms
,
make_float4
(
1e30
f
,
1e30
f
,
1e30
f
,
0
));
oldPositions
->
upload
(
oldPositionsVec
);
}
}
void
CudaNonbondedUtilities
::
setTileRange
(
int
startTileIndex
,
int
numTiles
)
{
this
->
startTileIndex
=
startTileIndex
;
this
->
numTiles
=
numTiles
;
void
CudaNonbondedUtilities
::
setUsePadding
(
bool
padding
)
{
usePadding
=
padding
;
}
void
CudaNonbondedUtilities
::
setAtomBlockRange
(
double
startFraction
,
double
endFraction
)
{
int
numAtomBlocks
=
context
.
getNumAtomBlocks
();
startBlockIndex
=
(
int
)
(
startFraction
*
numAtomBlocks
);
numBlocks
=
(
int
)
(
endFraction
*
numAtomBlocks
)
-
startBlockIndex
;
int
totalTiles
=
context
.
getNumAtomBlocks
()
*
(
context
.
getNumAtomBlocks
()
+
1
)
/
2
;
startTileIndex
=
(
int
)
(
startFraction
*
totalTiles
);;
numTiles
=
(
int
)
(
endFraction
*
totalTiles
)
-
startTileIndex
;
}
CUfunction
CudaNonbondedUtilities
::
createInteractionKernel
(
const
string
&
source
,
vector
<
ParameterInfo
>&
params
,
vector
<
ParameterInfo
>&
arguments
,
bool
useExclusions
,
bool
isSymmetric
)
{
...
...
@@ -447,6 +510,14 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
defines
[
"NUM_ATOMS"
]
=
context
.
intToString
(
context
.
getNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
context
.
intToString
(
context
.
getPaddedNumAtoms
());
defines
[
"NUM_BLOCKS"
]
=
context
.
intToString
(
context
.
getNumAtomBlocks
());
defines
[
"TILE_SIZE"
]
=
context
.
intToString
(
CudaContext
::
TileSize
);
int
numExclusionTiles
=
exclusionTiles
->
getSize
();
defines
[
"NUM_TILES_WITH_EXCLUSIONS"
]
=
context
.
intToString
(
numExclusionTiles
);
int
numContexts
=
context
.
getPlatformData
().
contexts
.
size
();
int
startExclusionIndex
=
context
.
getContextIndex
()
*
numExclusionTiles
/
numContexts
;
int
endExclusionIndex
=
(
context
.
getContextIndex
()
+
1
)
*
numExclusionTiles
/
numContexts
;
defines
[
"FIRST_EXCLUSION_TILE"
]
=
context
.
intToString
(
startExclusionIndex
);
defines
[
"LAST_EXCLUSION_TILE"
]
=
context
.
intToString
(
endExclusionIndex
);
if
((
localDataSize
/
4
)
%
2
==
0
&&
!
context
.
getUseDoublePrecision
())
defines
[
"PARAMETER_SIZE_IS_EVEN"
]
=
"1"
;
if
(
context
.
getComputeCapability
()
>=
3.0
&&
!
context
.
getUseDoublePrecision
())
...
...
@@ -461,8 +532,7 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
forceArgs
.
push_back
(
&
context
.
getEnergyBuffer
().
getDevicePointer
());
forceArgs
.
push_back
(
&
context
.
getPosq
().
getDevicePointer
());
forceArgs
.
push_back
(
&
exclusions
->
getDevicePointer
());
forceArgs
.
push_back
(
&
exclusionIndices
->
getDevicePointer
());
forceArgs
.
push_back
(
&
exclusionRowIndices
->
getDevicePointer
());
forceArgs
.
push_back
(
&
exclusionTiles
->
getDevicePointer
());
forceArgs
.
push_back
(
&
startTileIndex
);
forceArgs
.
push_back
(
&
numTiles
);
if
(
useCutoff
)
{
...
...
@@ -471,7 +541,8 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
forceArgs
.
push_back
(
context
.
getPeriodicBoxSizePointer
());
forceArgs
.
push_back
(
context
.
getInvPeriodicBoxSizePointer
());
forceArgs
.
push_back
(
&
maxTiles
);
forceArgs
.
push_back
(
&
interactionFlags
->
getDevicePointer
());
forceArgs
.
push_back
(
&
blockCenter
->
getDevicePointer
());
forceArgs
.
push_back
(
&
interactingAtoms
->
getDevicePointer
());
}
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
forceArgs
.
push_back
(
&
params
[
i
].
getMemory
());
...
...
platforms/cuda/src/CudaNonbondedUtilities.h
View file @
93c467b2
...
...
@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-201
2
Stanford University and the Authors. *
* Portions copyright (c) 2009-201
3
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -35,6 +35,8 @@
#include <vector>
namespace
OpenMM
{
class
CudaSort
;
/**
* This class provides a generic interface for calculating nonbonded interactions. It does this in two
...
...
@@ -181,10 +183,10 @@ public:
return
*
interactingTiles
;
}
/**
* Get the array containing
flags for
tile
s
with interactions.
* Get the array containing
the atoms in each
tile with interactions.
*/
CudaArray
&
getInteracti
onFlag
s
()
{
return
*
interacti
onFlag
s
;
CudaArray
&
getInteracti
ngAtom
s
()
{
return
*
interacti
ngAtom
s
;
}
/**
* Get the array containing exclusion flags.
...
...
@@ -192,6 +194,12 @@ public:
CudaArray
&
getExclusions
()
{
return
*
exclusions
;
}
/**
* Get the array containing tiles with exclusions.
*/
CudaArray
&
getExclusionTiles
()
{
return
*
exclusionTiles
;
}
/**
* Get the array containing the index into the exclusion array for each tile.
*/
...
...
@@ -217,9 +225,17 @@ public:
return
numTiles
;
}
/**
* Set the range of tiles that should be processed by this context.
* Set whether to add padding to the cutoff distance when building the neighbor list.
* This increases the size of the neighbor list (and thus the cost of computing interactions),
* but also means we don't need to rebuild it every time step. The default value is true,
* since usually this improves performance. For very expensive interactions, however,
* it may be better to set this to false.
*/
void
setUsePadding
(
bool
padding
);
/**
* Set the range of atom blocks and tiles that should be processed by this context.
*/
void
set
TileRange
(
int
startTileIndex
,
int
numTiles
);
void
set
AtomBlockRange
(
double
startFraction
,
double
endFraction
);
/**
* Create a Kernel for evaluating a nonbonded interaction. Cutoffs and periodic boundary conditions
* are assumed to be the same as those for the default interaction Kernel, since this kernel will use
...
...
@@ -232,42 +248,38 @@ public:
* @param isSymmetric specifies whether the interaction is symmetric
*/
CUfunction
createInteractionKernel
(
const
std
::
string
&
source
,
std
::
vector
<
ParameterInfo
>&
params
,
std
::
vector
<
ParameterInfo
>&
arguments
,
bool
useExclusions
,
bool
isSymmetric
);
/**
* This is a utility routine for locating data in the exclusions array. It takes the (x,y) indices of a tile,
* and returns the location in the array where the data for that tile begins.
*
* This routine requires that x >= y. If not, it will throw an exception.
*
* @param x the x index of the tile
* @param y the y index of the tile
* @param exclusionIndices the content of the exclusionIndices array
* @param exclusionRowIndices the content of the exclusionRowIndices array
* @return the index in the exclusions array at which the data for that tile begins
*/
static
int
findExclusionIndex
(
int
x
,
int
y
,
const
std
::
vector
<
unsigned
int
>&
exclusionIndices
,
const
std
::
vector
<
unsigned
int
>&
exclusionRowIndices
);
private:
class
BlockSortTrait
;
CudaContext
&
context
;
CUfunction
forceKernel
;
CUfunction
findBlockBoundsKernel
;
CUfunction
sortBoxDataKernel
;
CUfunction
findInteractingBlocksKernel
;
CUfunction
findInteractionsWithinBlocksKernel
;
CudaArray
*
exclusionTiles
;
CudaArray
*
exclusions
;
CudaArray
*
exclusionIndices
;
CudaArray
*
exclusionRowIndices
;
CudaArray
*
interactingTiles
;
CudaArray
*
interacti
onFlag
s
;
CudaArray
*
interacti
ngAtom
s
;
CudaArray
*
interactionCount
;
CudaArray
*
blockCenter
;
CudaArray
*
blockBoundingBox
;
std
::
vector
<
void
*>
forceArgs
,
findBlockBoundsArgs
,
findInteractingBlocksArgs
,
findInteractionsWithinBlocksArgs
;
CudaArray
*
sortedBlocks
;
CudaArray
*
sortedBlockCenter
;
CudaArray
*
sortedBlockBoundingBox
;
CudaArray
*
oldPositions
;
CudaArray
*
rebuildNeighborList
;
CudaSort
*
blockSorter
;
std
::
vector
<
void
*>
forceArgs
,
findBlockBoundsArgs
,
sortBoxDataArgs
,
findInteractingBlocksArgs
;
std
::
vector
<
std
::
vector
<
int
>
>
atomExclusions
;
std
::
vector
<
ParameterInfo
>
parameters
;
std
::
vector
<
ParameterInfo
>
arguments
;
std
::
string
kernelSource
;
std
::
map
<
std
::
string
,
std
::
string
>
kernelDefines
;
double
cutoff
;
bool
useCutoff
,
usePeriodic
,
anyExclusions
;
int
startTileIndex
,
numTiles
,
maxTiles
,
numForceThreadBlocks
,
forceThreadBlockSize
,
nonbondedForceGroup
,
numAtoms
;
bool
useCutoff
,
usePeriodic
,
anyExclusions
,
usePadding
;
int
startTileIndex
,
numTiles
,
startBlockIndex
,
numBlocks
,
maxTiles
,
numForceThreadBlocks
,
forceThreadBlockSize
,
nonbondedForceGroup
,
numAtoms
;
};
/**
...
...
platforms/cuda/src/CudaParallelKernels.cpp
View file @
93c467b2
...
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011-201
2
Stanford University and the Authors. *
* Portions copyright (c) 2011-201
3
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -118,7 +118,7 @@ private:
};
CudaParallelCalcForcesAndEnergyKernel
::
CudaParallelCalcForcesAndEnergyKernel
(
string
name
,
const
Platform
&
platform
,
CudaPlatform
::
PlatformData
&
data
)
:
CalcForcesAndEnergyKernel
(
name
,
platform
),
data
(
data
),
completionTimes
(
data
.
contexts
.
size
()),
context
Tile
s
(
data
.
contexts
.
size
()),
contextForces
(
NULL
),
CalcForcesAndEnergyKernel
(
name
,
platform
),
data
(
data
),
completionTimes
(
data
.
contexts
.
size
()),
context
NonbondedFraction
s
(
data
.
contexts
.
size
()),
contextForces
(
NULL
),
pinnedPositionBuffer
(
NULL
),
pinnedForceBuffer
(
NULL
)
{
for
(
int
i
=
0
;
i
<
(
int
)
data
.
contexts
.
size
();
i
++
)
kernels
.
push_back
(
Kernel
(
new
CudaCalcForcesAndEnergyKernel
(
name
,
platform
,
*
data
.
contexts
[
i
])));
...
...
@@ -141,6 +141,8 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
sumKernel
=
cu
.
getKernel
(
module
,
"sumForces"
);
for
(
int
i
=
0
;
i
<
(
int
)
kernels
.
size
();
i
++
)
getKernel
(
i
).
initialize
(
system
);
for
(
int
i
=
0
;
i
<
(
int
)
contextNonbondedFractions
.
size
();
i
++
)
contextNonbondedFractions
[
i
]
=
1
/
(
double
)
contextNonbondedFractions
.
size
();
}
void
CudaParallelCalcForcesAndEnergyKernel
::
beginComputation
(
ContextImpl
&
context
,
bool
includeForce
,
bool
includeEnergy
,
int
groups
)
{
...
...
@@ -184,30 +186,26 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
void
*
args
[]
=
{
&
cu
.
getForce
().
getDevicePointer
(),
&
contextForces
->
getDevicePointer
(),
&
bufferSize
,
&
numBuffers
};
cu
.
executeKernel
(
sumKernel
,
args
,
bufferSize
);
// Balance work between the contexts by transferring a
few
nonbonded
tiles
from the context that
// Balance work between the contexts by transferring a
little
nonbonded
work
from the context that
// finished last to the one that finished first.
int
firstIndex
=
0
,
lastIndex
=
0
;
int
totalTiles
=
0
;
for
(
int
i
=
0
;
i
<
(
int
)
completionTimes
.
size
();
i
++
)
{
if
(
completionTimes
[
i
]
<
completionTimes
[
firstIndex
])
firstIndex
=
i
;
if
(
completionTimes
[
i
]
>
completionTimes
[
lastIndex
])
lastIndex
=
i
;
contextTiles
[
i
]
=
data
.
contexts
[
i
]
->
getNonbondedUtilities
().
getNumTiles
();
totalTiles
+=
contextTiles
[
i
];
}
int
tilesToTransfer
=
totalTiles
/
1000
;
if
(
tilesToTransfer
<
1
)
tilesToTransfer
=
1
;
if
(
tilesToTransfer
>
contextTiles
[
lastIndex
])
tilesToTransfer
=
contextTiles
[
lastIndex
];
contextTiles
[
firstIndex
]
+=
tilesToTransfer
;
contextTiles
[
lastIndex
]
-=
tilesToTransfer
;
int
startIndex
=
0
;
for
(
int
i
=
0
;
i
<
(
int
)
contextTiles
.
size
();
i
++
)
{
data
.
contexts
[
i
]
->
getNonbondedUtilities
().
setTileRange
(
startIndex
,
contextTiles
[
i
]);
startIndex
+=
contextTiles
[
i
];
double
fractionToTransfer
=
min
(
0.001
,
contextNonbondedFractions
[
lastIndex
]);
contextNonbondedFractions
[
firstIndex
]
+=
fractionToTransfer
;
contextNonbondedFractions
[
lastIndex
]
-=
fractionToTransfer
;
double
startFraction
=
0.0
;
for
(
int
i
=
0
;
i
<
(
int
)
contextNonbondedFractions
.
size
();
i
++
)
{
double
endFraction
=
startFraction
+
contextNonbondedFractions
[
i
];
if
(
i
==
contextNonbondedFractions
.
size
()
-
1
)
endFraction
=
1.0
;
// Avoid roundoff error
data
.
contexts
[
i
]
->
getNonbondedUtilities
().
setAtomBlockRange
(
startFraction
,
endFraction
);
startFraction
=
endFraction
;
}
}
return
energy
;
...
...
platforms/cuda/src/CudaParallelKernels.h
View file @
93c467b2
...
...
@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011-201
2
Stanford University and the Authors. *
* Portions copyright (c) 2011-201
3
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -80,7 +80,7 @@ private:
CudaPlatform
::
PlatformData
&
data
;
std
::
vector
<
Kernel
>
kernels
;
std
::
vector
<
long
long
>
completionTimes
;
std
::
vector
<
int
>
context
Tile
s
;
std
::
vector
<
double
>
context
NonbondedFraction
s
;
CudaArray
*
contextForces
;
void
*
pinnedPositionBuffer
;
long
long
*
pinnedForceBuffer
;
...
...
platforms/cuda/src/CudaSort.cpp
View file @
93c467b2
...
...
@@ -32,7 +32,7 @@ using namespace OpenMM;
using
namespace
std
;
CudaSort
::
CudaSort
(
CudaContext
&
context
,
SortTrait
*
trait
,
unsigned
int
length
)
:
context
(
context
),
trait
(
trait
),
dataRange
(
NULL
),
bucketOfElement
(
NULL
),
offsetInBucket
(
NULL
),
bucketOffset
(
NULL
),
buckets
(
NULL
)
{
dataRange
(
NULL
),
bucketOfElement
(
NULL
),
offsetInBucket
(
NULL
),
bucketOffset
(
NULL
),
buckets
(
NULL
)
,
dataLength
(
length
)
{
// Create kernels.
map
<
string
,
string
>
replacements
;
...
...
@@ -43,6 +43,7 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
replacements
[
"MAX_KEY"
]
=
trait
->
getMaxKey
();
replacements
[
"MAX_VALUE"
]
=
trait
->
getMaxValue
();
CUmodule
module
=
context
.
createModule
(
context
.
replaceStrings
(
CudaKernelSources
::
sort
,
replacements
));
shortListKernel
=
context
.
getKernel
(
module
,
"sortShortList"
);
computeRangeKernel
=
context
.
getKernel
(
module
,
"computeRange"
);
assignElementsKernel
=
context
.
getKernel
(
module
,
"assignElementsToBuckets"
);
computeBucketPositionsKernel
=
context
.
getKernel
(
module
,
"computeBucketPositions"
);
...
...
@@ -53,15 +54,16 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
int
maxBlockSize
;
cuDeviceGetAttribute
(
&
maxBlockSize
,
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X
,
context
.
getDevice
());
int
maxSharedMem
;
cuDeviceGetAttribute
(
&
maxSharedMem
,
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK
,
context
.
getDevice
());
unsigned
int
maxLocalBuffer
=
(
unsigned
int
)
((
maxSharedMem
/
trait
->
getDataSize
())
/
2
);
isShortList
=
(
length
<=
maxLocalBuffer
);
for
(
rangeKernelSize
=
1
;
rangeKernelSize
*
2
<=
maxBlockSize
;
rangeKernelSize
*=
2
)
;
positionsKernelSize
=
rangeKernelSize
;
sortKernelSize
=
rangeKernelSize
/
2
;
sortKernelSize
=
(
isShortList
?
rangeKernelSize
/
2
:
rangeKernelSize
/
4
)
;
if
(
rangeKernelSize
>
length
)
rangeKernelSize
=
length
;
int
maxSharedMem
;
cuDeviceGetAttribute
(
&
maxSharedMem
,
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK
,
context
.
getDevice
());
unsigned
int
maxLocalBuffer
=
(
unsigned
int
)
((
maxSharedMem
/
trait
->
getDataSize
())
/
2
);
if
(
sortKernelSize
>
maxLocalBuffer
)
sortKernelSize
=
maxLocalBuffer
;
unsigned
int
targetBucketSize
=
sortKernelSize
/
2
;
...
...
@@ -73,11 +75,13 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
// Create workspace arrays.
dataRange
=
new
CudaArray
(
context
,
2
,
trait
->
getKeySize
(),
"sortDataRange"
);
bucketOffset
=
CudaArray
::
create
<
uint1
>
(
context
,
numBuckets
,
"bucketOffset"
);
bucketOfElement
=
CudaArray
::
create
<
uint1
>
(
context
,
length
,
"bucketOfElement"
);
offsetInBucket
=
CudaArray
::
create
<
uint1
>
(
context
,
length
,
"offsetInBucket"
);
buckets
=
new
CudaArray
(
context
,
length
,
trait
->
getDataSize
(),
"buckets"
);
if
(
!
isShortList
)
{
dataRange
=
new
CudaArray
(
context
,
2
,
trait
->
getKeySize
(),
"sortDataRange"
);
bucketOffset
=
CudaArray
::
create
<
uint1
>
(
context
,
numBuckets
,
"bucketOffset"
);
bucketOfElement
=
CudaArray
::
create
<
uint1
>
(
context
,
length
,
"bucketOfElement"
);
offsetInBucket
=
CudaArray
::
create
<
uint1
>
(
context
,
length
,
"offsetInBucket"
);
buckets
=
new
CudaArray
(
context
,
length
,
trait
->
getDataSize
(),
"buckets"
);
}
}
CudaSort
::~
CudaSort
()
{
...
...
@@ -95,38 +99,44 @@ CudaSort::~CudaSort() {
}
void
CudaSort
::
sort
(
CudaArray
&
data
)
{
if
(
data
.
getSize
()
!=
bucketOfElement
->
getSize
()
||
data
.
getElementSize
()
!=
trait
->
getDataSize
())
if
(
data
.
getSize
()
!=
dataLength
||
data
.
getElementSize
()
!=
trait
->
getDataSize
())
throw
OpenMMException
(
"CudaSort called with different data size"
);
if
(
data
.
getSize
()
==
0
)
return
;
if
(
isShortList
)
{
// We can use a simpler sort kernel that does the entire operation at once in local memory.
void
*
sortArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
dataLength
};
context
.
executeKernel
(
shortListKernel
,
sortArgs
,
sortKernelSize
,
sortKernelSize
,
dataLength
*
trait
->
getDataSize
());
}
else
{
// Compute the range of data values.
// Compute the range of data values.
unsigned
int
dataSize
=
data
.
getSize
();
void
*
rangeArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
dataSize
,
&
dataRange
->
getDevicePointer
()};
context
.
executeKernel
(
computeRangeKernel
,
rangeArgs
,
rangeKernelSize
,
rangeKernelSize
,
rangeKernelSize
*
trait
->
getKeySize
());
void
*
rangeArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
dataLength
,
&
dataRange
->
getDevicePointer
()};
context
.
executeKernel
(
computeRangeKernel
,
rangeArgs
,
rangeKernelSize
,
rangeKernelSize
,
rangeKernelSize
*
trait
->
getKeySize
());
// Assign array elements to buckets.
// Assign array elements to buckets.
unsigned
int
numBuckets
=
bucketOffset
->
getSize
();
context
.
clearBuffer
(
*
bucketOffset
);
void
*
elementsArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
data
Size
,
&
numBuckets
,
&
dataRange
->
getDevicePointer
(),
&
bucketOffset
->
getDevicePointer
(),
&
bucketOfElement
->
getDevicePointer
(),
&
offsetInBucket
->
getDevicePointer
()};
context
.
executeKernel
(
assignElementsKernel
,
elementsArgs
,
data
.
getSize
());
unsigned
int
numBuckets
=
bucketOffset
->
getSize
();
context
.
clearBuffer
(
*
bucketOffset
);
void
*
elementsArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
data
Length
,
&
numBuckets
,
&
dataRange
->
getDevicePointer
(),
&
bucketOffset
->
getDevicePointer
(),
&
bucketOfElement
->
getDevicePointer
(),
&
offsetInBucket
->
getDevicePointer
()};
context
.
executeKernel
(
assignElementsKernel
,
elementsArgs
,
data
.
getSize
());
// Compute the position of each bucket.
// Compute the position of each bucket.
void
*
computeArgs
[]
=
{
&
numBuckets
,
&
bucketOffset
->
getDevicePointer
()};
context
.
executeKernel
(
computeBucketPositionsKernel
,
computeArgs
,
positionsKernelSize
,
positionsKernelSize
,
positionsKernelSize
*
sizeof
(
int
));
void
*
computeArgs
[]
=
{
&
numBuckets
,
&
bucketOffset
->
getDevicePointer
()};
context
.
executeKernel
(
computeBucketPositionsKernel
,
computeArgs
,
positionsKernelSize
,
positionsKernelSize
,
positionsKernelSize
*
sizeof
(
int
));
// Copy the data into the buckets.
// Copy the data into the buckets.
void
*
copyArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
buckets
->
getDevicePointer
(),
&
data
Size
,
&
bucketOffset
->
getDevicePointer
(),
&
bucketOfElement
->
getDevicePointer
(),
&
offsetInBucket
->
getDevicePointer
()};
context
.
executeKernel
(
copyToBucketsKernel
,
copyArgs
,
data
.
getSize
());
void
*
copyArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
buckets
->
getDevicePointer
(),
&
data
Length
,
&
bucketOffset
->
getDevicePointer
(),
&
bucketOfElement
->
getDevicePointer
(),
&
offsetInBucket
->
getDevicePointer
()};
context
.
executeKernel
(
copyToBucketsKernel
,
copyArgs
,
data
.
getSize
());
// Sort each bucket.
// Sort each bucket.
void
*
sortArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
buckets
->
getDevicePointer
(),
&
numBuckets
,
&
bucketOffset
->
getDevicePointer
()};
context
.
executeKernel
(
sortBucketsKernel
,
sortArgs
,
((
data
.
getSize
()
+
sortKernelSize
-
1
)
/
sortKernelSize
)
*
sortKernelSize
,
sortKernelSize
,
sortKernelSize
*
trait
->
getDataSize
());
void
*
sortArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
buckets
->
getDevicePointer
(),
&
numBuckets
,
&
bucketOffset
->
getDevicePointer
()};
context
.
executeKernel
(
sortBucketsKernel
,
sortArgs
,
((
data
.
getSize
()
+
sortKernelSize
-
1
)
/
sortKernelSize
)
*
sortKernelSize
,
sortKernelSize
,
sortKernelSize
*
trait
->
getDataSize
());
}
}
platforms/cuda/src/CudaSort.h
View file @
93c467b2
...
...
@@ -92,8 +92,9 @@ private:
CudaArray
*
offsetInBucket
;
CudaArray
*
bucketOffset
;
CudaArray
*
buckets
;
CUfunction
computeRangeKernel
,
assignElementsKernel
,
computeBucketPositionsKernel
,
copyToBucketsKernel
,
sortBucketsKernel
;
unsigned
int
rangeKernelSize
,
positionsKernelSize
,
sortKernelSize
;
CUfunction
shortListKernel
,
computeRangeKernel
,
assignElementsKernel
,
computeBucketPositionsKernel
,
copyToBucketsKernel
,
sortBucketsKernel
;
unsigned
int
dataLength
,
rangeKernelSize
,
positionsKernelSize
,
sortKernelSize
;
bool
isShortList
;
};
/**
...
...
platforms/cuda/src/kernels/coulombLennardJones.cu
View file @
93c467b2
#if USE_EWALD
bool
needCorrection
=
isExcluded
&&
atom1
!=
atom2
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
;
bool
needCorrection
=
hasExclusions
&&
isExcluded
&&
atom1
!=
atom2
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
;
if
(
!
isExcluded
||
needCorrection
)
{
real
tempForce
=
0.0
f
;
if
(
r2
<
CUTOFF_SQUARED
||
needCorrection
)
{
const
real
alphaR
=
EWALD_ALPHA
*
r
;
const
real
expAlphaRSqr
=
EXP
(
-
alphaR
*
alphaR
);
...
...
@@ -16,6 +15,7 @@ if (!isExcluded || needCorrection) {
t
*=
t
;
t
*=
t
;
const
real
erfcAlphaR
=
RECIP
(
t
*
t
);
real
tempForce
=
0.0
f
;
if
(
needCorrection
)
{
// Subtract off the part of this interaction that was included in the reciprocal space contribution.
...
...
@@ -36,8 +36,8 @@ if (!isExcluded || needCorrection) {
tempEnergy
+=
prefactor
*
erfcAlphaR
;
#endif
}
dEdR
+=
tempForce
*
invR
*
invR
;
}
dEdR
+=
tempForce
*
invR
*
invR
;
}
#else
{
...
...
platforms/cuda/src/kernels/customGBEnergyN2.cu
View file @
93c467b2
#define STORE_DERIVATIVE_1(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (deriv##INDEX##_1*0x100000000)));
#define STORE_DERIVATIVE_2(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].deriv##INDEX*0x100000000)));
#define TILE_SIZE 32
typedef
struct
{
real4
posq
;
...
...
@@ -15,169 +14,305 @@ typedef struct {
* Compute a force based on pair interactions.
*/
extern
"C"
__global__
void
computeN2Energy
(
unsigned
long
long
*
__restrict__
forceBuffers
,
real
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
unsigned
int
*
__restrict__
exclusions
,
const
unsigned
int
*
__restrict__
exclusionIndices
,
const
unsigned
int
*
__restrict__
exclusionRowIndices
,
const
real4
*
__restrict__
posq
,
const
unsigned
int
*
__restrict__
exclusions
,
const
ushort2
*
__restrict__
exclusionTiles
,
#ifdef USE_CUTOFF
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
unsigned
int
*
__restrict__
interacti
onFlag
s
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
const
unsigned
int
*
__restrict__
interacti
ngAtom
s
#else
unsigned
int
numTiles
#endif
PARAMETER_ARGUMENTS
)
{
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
#ifdef USE_CUTOFF
unsigned
int
numTiles
=
interactionCount
[
0
];
unsigned
int
pos
=
warp
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
unsigned
int
end
=
(
warp
+
1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
#else
unsigned
int
pos
=
warp
*
numTiles
/
totalWarps
;
unsigned
int
end
=
(
warp
+
1
)
*
numTiles
/
totalWarps
;
#endif
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
const
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
real
energy
=
0
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
__shared__
unsigned
int
exclusionRange
[
2
*
WARPS_PER_GROUP
];
__shared__
int
exclusionIndex
[
WARPS_PER_GROUP
];
// First loop: process tiles that contain exclusions.
do
{
// Extract the coordinates of this tile
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
u
nsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
const
unsigned
int
localGroupIndex
=
threadIdx
.
x
/
TILE_SIZE
;
unsigned
int
x
,
y
;
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE
+
warp
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE
+
(
warp
+
1
)
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
for
(
int
pos
=
firstExclusionTile
;
pos
<
lastExclusionTile
;
pos
++
)
{
const
u
short2
tileIndices
=
exclusionTiles
[
pos
]
;
const
unsigned
int
x
=
tileIndices
.
x
;
const
unsigned
int
y
=
tileIndices
.
y
;
real3
force
=
make_real3
(
0
);
DECLARE_ATOM1_DERIVATIVES
if
(
pos
<
end
)
{
#ifdef USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles
[
pos
];
x
=
tileIndices
.
x
;
y
=
tileIndices
.
y
;
}
else
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real4
posq1
=
posq
[
atom1
];
LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS
unsigned
int
excl
=
exclusions
[
pos
*
TILE_SIZE
+
tgx
];
#endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}
}
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real4
posq1
=
posq
[
atom1
];
LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
if
(
x
==
y
)
{
// This tile is on the diagonal.
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
localData
[
localAtomIndex
].
posq
=
posq1
;
LOAD_LOCAL_PARAMETERS_FROM_1
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
j
;
real4
posq2
=
localData
[
atom2
].
posq
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
LOAD_ATOM2_PARAMETERS
atom2
=
y
*
TILE_SIZE
+
j
;
real
dEdR
=
0
;
real
tempEnergy
=
0
;
#ifdef USE_EXCLUSIONS
if
(
tgx
<
2
)
exclusionRange
[
2
*
localGroupIndex
+
tgx
]
=
exclusionRowIndices
[
x
+
tgx
];
if
(
tgx
==
0
)
exclusionIndex
[
localGroupIndex
]
=
-
1
;
for
(
unsigned
int
i
=
exclusionRange
[
2
*
localGroupIndex
]
+
tgx
;
i
<
exclusionRange
[
2
*
localGroupIndex
+
1
];
i
+=
TILE_SIZE
)
if
(
exclusionIndices
[
i
]
==
y
)
exclusionIndex
[
localGroupIndex
]
=
i
*
TILE_SIZE
;
bool
hasExclusions
=
(
exclusionIndex
[
localGroupIndex
]
>
-
1
);
#else
bool
hasExclusions
=
false
;
bool
isExcluded
=
!
(
excl
&
0x1
);
#endif
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
atom1
!=
atom2
)
{
COMPUTE_INTERACTION
dEdR
/=
-
r
;
}
energy
+=
0.5
f
*
tempEnergy
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
#ifdef USE_CUTOFF
}
#endif
if
(
pos
>=
end
)
;
// This warp is done.
else
if
(
x
==
y
)
{
// This tile is on the diagonal.
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
localData
[
localAtomIndex
].
posq
=
posq1
;
LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS
unsigned
int
excl
=
exclusions
[
exclusionIndex
[
localGroupIndex
]
+
tgx
]
;
excl
>>=
1
;
#endif
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
}
}
else
{
// This is an off-diagonal tile.
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
localData
[
localAtomIndex
].
posq
=
posq
[
j
];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData
[
localAtomIndex
].
force
=
make_real3
(
0
);
CLEAR_LOCAL_DERIVATIVES
#ifdef USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
);
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
TILE_SIZE
-
tgx
)
);
#endif
int
atom2
=
tbx
+
j
;
real4
posq2
=
localData
[
atom2
].
posq
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
real4
posq2
=
localData
[
atom2
].
posq
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
if
(
r2
<
CUTOFF_SQUARED
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
LOAD_ATOM2_PARAMETERS
atom2
=
y
*
TILE_SIZE
+
j
;
atom2
=
y
*
TILE_SIZE
+
t
j
;
real
dEdR
=
0
;
real
tempEnergy
=
0
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
atom1
!=
atom2
)
{
#ifdef USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
);
#endif
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
COMPUTE_INTERACTION
dEdR
/=
-
r
;
}
energy
+=
0.5
f
*
tempEnergy
;
energy
+=
tempEnergy
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
atom2
=
tbx
+
tj
;
localData
[
atom2
].
force
.
x
+=
delta
.
x
;
localData
[
atom2
].
force
.
y
+=
delta
.
y
;
localData
[
atom2
].
force
.
z
+=
delta
.
z
;
RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
}
}
#endif
#ifdef USE_EXCLUSIONS
excl
>>=
1
;
excl
>>=
1
;
#endif
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
else
{
// This is an off-diagonal tile.
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
if
(
lasty
!=
y
)
{
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
localData
[
localAtomIndex
].
posq
=
posq
[
j
];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
localData
[
localAtomIndex
].
force
=
make_real3
(
0
);
CLEAR_LOCAL_DERIVATIVES
}
// Write results.
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
z
*
0x100000000
)));
STORE_DERIVATIVES_1
if
(
x
!=
y
)
{
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
STORE_DERIVATIVES_2
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
unsigned
int
flags
=
(
numTiles
<=
maxTiles
?
interactionFlags
[
pos
]
:
0xFFFFFFFF
);
if
(
!
hasExclusions
&&
flags
==
0
)
{
// No interactions in this tile.
unsigned
int
numTiles
=
interactionCount
[
0
];
int
pos
=
warp
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
int
end
=
(
warp
+
1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
#else
int
pos
=
warp
*
numTiles
/
totalWarps
;
int
end
=
(
warp
+
1
)
*
numTiles
/
totalWarps
;
#endif
int
skipBase
=
0
;
int
currentSkipIndex
=
tbx
;
__shared__
int
atomIndices
[
THREAD_BLOCK_SIZE
];
__shared__
int
skipTiles
[
THREAD_BLOCK_SIZE
];
skipTiles
[
threadIdx
.
x
]
=
-
1
;
while
(
pos
<
end
)
{
const
bool
isExcluded
=
false
;
real3
force
=
make_real3
(
0
);
DECLARE_ATOM1_DERIVATIVES
bool
includeTile
=
true
;
// Extract the coordinates of this tile.
unsigned
int
x
,
y
;
bool
singlePeriodicCopy
=
false
;
#ifdef USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles
[
pos
];
x
=
tileIndices
.
x
;
singlePeriodicCopy
=
tileIndices
.
y
;
}
else
#endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}
// Skip over tiles that have exclusions, since they were already processed.
while
(
skipTiles
[
tbx
+
TILE_SIZE
-
1
]
<
pos
)
{
if
(
skipBase
+
tgx
<
NUM_TILES_WITH_EXCLUSIONS
)
{
ushort2
tile
=
exclusionTiles
[
skipBase
+
tgx
];
skipTiles
[
threadIdx
.
x
]
=
tile
.
x
+
tile
.
y
*
NUM_BLOCKS
-
tile
.
y
*
(
tile
.
y
+
1
)
/
2
;
}
else
skipTiles
[
threadIdx
.
x
]
=
end
;
skipBase
+=
TILE_SIZE
;
currentSkipIndex
=
tbx
;
}
while
(
skipTiles
[
currentSkipIndex
]
<
pos
)
currentSkipIndex
++
;
includeTile
=
(
skipTiles
[
currentSkipIndex
]
!=
pos
);
}
if
(
includeTile
)
{
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
// Load atom data for this tile.
real4
posq1
=
posq
[
atom1
];
LOAD_ATOM1_PARAMETERS
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
#ifdef USE_CUTOFF
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
]
:
y
*
TILE_SIZE
+
tgx
);
#else
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
{
// Compute the full set of interactions in this tile.
atomIndices
[
threadIdx
.
x
]
=
j
;
if
(
j
<
PADDED_NUM_ATOMS
)
{
localData
[
localAtomIndex
].
posq
=
posq
[
j
];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData
[
localAtomIndex
].
force
=
make_real3
(
0
);
CLEAR_LOCAL_DERIVATIVES
}
#ifdef USE_PERIODIC
if
(
singlePeriodicCopy
)
{
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
#ifdef USE_EXCLUSIONS
unsigned
int
excl
=
(
hasExclusions
?
exclusions
[
exclusionIndex
[
localGroupIndex
]
+
tgx
]
:
0xFFFFFFFF
);
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
TILE_SIZE
-
tgx
));
real4
blockCenterX
=
blockCenter
[
x
];
posq1
.
x
-=
floor
((
posq1
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
posq1
.
y
-=
floor
((
posq1
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
posq1
.
z
-=
floor
((
posq1
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
localData
[
threadIdx
.
x
].
posq
.
x
-=
floor
((
localData
[
threadIdx
.
x
].
posq
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
localData
[
threadIdx
.
x
].
posq
.
y
-=
floor
((
localData
[
threadIdx
.
x
].
posq
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
localData
[
threadIdx
.
x
].
posq
.
z
-=
floor
((
localData
[
threadIdx
.
x
].
posq
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
real4
posq2
=
localData
[
atom2
].
posq
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#endif
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
#ifdef USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
);
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
LOAD_ATOM2_PARAMETERS
atom2
=
atomIndices
[
tbx
+
tj
];
real
dEdR
=
0
;
real
tempEnergy
=
0
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
COMPUTE_INTERACTION
dEdR
/=
-
r
;
}
energy
+=
tempEnergy
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
atom2
=
tbx
+
tj
;
localData
[
atom2
].
force
.
x
+=
delta
.
x
;
localData
[
atom2
].
force
.
y
+=
delta
.
y
;
localData
[
atom2
].
force
.
z
+=
delta
.
z
;
RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
}
#endif
int
atom2
=
tbx
+
tj
;
real4
posq2
=
localData
[
atom2
].
posq
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
else
#endif
{
// We need to apply periodic boundary conditions separately for each interaction.
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
real4
posq2
=
localData
[
atom2
].
posq
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
if
(
r2
<
CUTOFF_SQUARED
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
LOAD_ATOM2_PARAMETERS
atom2
=
y
*
TILE_SIZE
+
tj
;
atom2
=
atomIndices
[
tbx
+
tj
]
;
real
dEdR
=
0
;
real
tempEnergy
=
0
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
...
...
@@ -195,35 +330,33 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
localData
[
atom2
].
force
.
z
+=
delta
.
z
;
RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl
>>=
1
;
#endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
#endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
}
lasty
=
y
;
// Write results.
if
(
pos
<
end
)
{
const
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
z
*
0x100000000
)));
// Write results.
atomicAdd
(
&
forceBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
z
*
0x100000000
)));
unsigned
int
offset
=
atom1
;
STORE_DERIVATIVES_1
}
if
(
pos
<
end
&&
x
!=
y
)
{
const
unsigned
int
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
STORE_DERIVATIVES_2
#ifdef USE_CUTOFF
unsigned
int
atom2
=
atomIndices
[
threadIdx
.
x
];
#else
unsigned
int
atom2
=
y
*
TILE_SIZE
+
tgx
;
#endif
if
(
atom2
<
PADDED_NUM_ATOMS
)
{
atomicAdd
(
&
forceBuffers
[
atom2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
offset
=
atom2
;
STORE_DERIVATIVES_2
}
}
pos
++
;
}
while
(
pos
<
end
);
}
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
}
platforms/cuda/src/kernels/customGBValueN2.cu
View file @
93c467b2
#define TILE_SIZE 32
typedef
struct
{
real4
posq
;
real
value
,
temp
;
...
...
@@ -13,238 +11,295 @@ typedef struct {
* Compute a value based on pair interactions.
*/
extern
"C"
__global__
void
computeN2Value
(
const
real4
*
__restrict__
posq
,
const
unsigned
int
*
__restrict__
exclusions
,
const
u
nsigned
int
*
__restrict__
exclusion
Indices
,
const
unsigned
int
*
__restrict__
exclusionRowIndic
es
,
unsigned
long
long
*
__restrict__
global_value
,
const
u
short2
*
__restrict__
exclusion
Til
es
,
unsigned
long
long
*
__restrict__
global_value
,
#ifdef USE_CUTOFF
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
unsigned
int
*
__restrict__
interacti
onFlag
s
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
const
unsigned
int
*
__restrict__
interacti
ngAtom
s
#else
unsigned
int
numTiles
#endif
PARAMETER_ARGUMENTS
)
{
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
#ifdef USE_CUTOFF
unsigned
int
numTiles
=
interactionCount
[
0
];
unsigned
int
pos
=
warp
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
unsigned
int
end
=
(
warp
+
1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
#else
unsigned
int
pos
=
warp
*
numTiles
/
totalWarps
;
unsigned
int
end
=
(
warp
+
1
)
*
numTiles
/
totalWarps
;
#endif
real
energy
=
0
;
unsigned
int
lasty
=
0xFFFFFFFF
;
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
const
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
__shared__
unsigned
int
exclusionRange
[
2
*
WARPS_PER_GROUP
];
__shared__
int
exclusionIndex
[
WARPS_PER_GROUP
];
// First loop: process tiles that contain exclusions.
do
{
// Extract the coordinates of this tile
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
u
nsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
const
unsigned
int
localGroupIndex
=
threadIdx
.
x
/
TILE_SIZE
;
unsigned
int
x
,
y
;
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE
+
warp
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE
+
(
warp
+
1
)
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
for
(
int
pos
=
firstExclusionTile
;
pos
<
lastExclusionTile
;
pos
++
)
{
const
u
short2
tileIndices
=
exclusionTiles
[
pos
]
;
const
unsigned
int
x
=
tileIndices
.
x
;
const
unsigned
int
y
=
tileIndices
.
y
;
real
value
=
0
;
if
(
pos
<
end
)
{
#ifdef USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles
[
pos
];
x
=
tileIndices
.
x
;
y
=
tileIndices
.
y
;
}
else
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real4
posq1
=
posq
[
atom1
];
LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS
unsigned
int
excl
=
exclusions
[
pos
*
TILE_SIZE
+
tgx
];
#endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}
}
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real4
posq1
=
posq
[
atom1
];
LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
if
(
x
==
y
)
{
// This tile is on the diagonal.
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
localData
[
localAtomIndex
].
posq
=
posq1
;
LOAD_LOCAL_PARAMETERS_FROM_1
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
j
;
real4
posq2
=
localData
[
atom2
].
posq
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
LOAD_ATOM2_PARAMETERS
atom2
=
y
*
TILE_SIZE
+
j
;
real
tempValue1
=
0
;
real
tempValue2
=
0
;
#ifdef USE_EXCLUSIONS
if
(
tgx
<
2
)
exclusionRange
[
2
*
localGroupIndex
+
tgx
]
=
exclusionRowIndices
[
x
+
tgx
];
if
(
tgx
==
0
)
exclusionIndex
[
localGroupIndex
]
=
-
1
;
for
(
unsigned
int
i
=
exclusionRange
[
2
*
localGroupIndex
]
+
tgx
;
i
<
exclusionRange
[
2
*
localGroupIndex
+
1
];
i
+=
TILE_SIZE
)
if
(
exclusionIndices
[
i
]
==
y
)
exclusionIndex
[
localGroupIndex
]
=
i
*
TILE_SIZE
;
bool
hasExclusions
=
(
exclusionIndex
[
localGroupIndex
]
>
-
1
);
bool
isExcluded
=
(
atom1
>=
NUM_ATOMS
||
atom2
>=
NUM_ATOMS
||
!
(
excl
&
0x1
));
if
(
!
isExcluded
&&
atom1
!=
atom2
)
{
#else
bool
hasExclusions
=
false
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
atom1
!=
atom2
)
{
#endif
COMPUTE_VALUE
}
value
+=
tempValue1
;
#ifdef USE_CUTOFF
}
#endif
if
(
pos
>=
end
)
;
// This warp is done.
else
if
(
x
==
y
)
{
// This tile is on the diagonal.
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
localData
[
localAtomIndex
].
posq
=
posq1
;
LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS
unsigned
int
excl
=
exclusions
[
exclusionIndex
[
localGroupIndex
]
+
tgx
]
;
excl
>>=
1
;
#endif
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
}
}
else
{
// This is an off-diagonal tile.
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
localData
[
localAtomIndex
].
posq
=
posq
[
j
];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData
[
localAtomIndex
].
value
=
0
;
#ifdef USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
);
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
TILE_SIZE
-
tgx
)
);
#endif
int
atom2
=
tbx
+
j
;
real4
posq2
=
localData
[
atom2
].
posq
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
real4
posq2
=
localData
[
atom2
].
posq
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
if
(
r2
<
CUTOFF_SQUARED
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
LOAD_ATOM2_PARAMETERS
atom2
=
y
*
TILE_SIZE
+
j
;
atom2
=
y
*
TILE_SIZE
+
t
j
;
real
tempValue1
=
0
;
real
tempValue2
=
0
;
#ifdef USE_EXCLUSIONS
if
(
!
isExcluded
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
atom1
!=
atom2
)
{
bool
isExcluded
=
(
atom1
>=
NUM_ATOMS
||
atom2
>=
NUM_ATOMS
||
!
(
excl
&
0x1
));
if
(
!
isExcluded
)
{
#else
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
atom1
!=
atom2
)
{
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
#endif
COMPUTE_VALUE
}
value
+=
tempValue1
;
localData
[
tbx
+
tj
].
value
+=
tempValue2
;
#ifdef USE_CUTOFF
}
}
#endif
#ifdef USE_EXCLUSIONS
excl
>>=
1
;
excl
>>=
1
;
#endif
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
else
{
// This is an off-diagonal tile.
}
// Write results.
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
global_value
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
value
*
0x100000000
)));
if
(
x
!=
y
)
{
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
global_value
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
value
*
0x100000000
)));
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
if
(
lasty
!=
y
)
{
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
localData
[
threadIdx
.
x
].
posq
=
posq
[
j
];
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
localData
[
threadIdx
.
x
].
value
=
0
;
#ifdef USE_CUTOFF
unsigned
int
flags
=
(
numTiles
<=
maxTiles
?
interactionFlags
[
pos
]
:
0xFFFFFFFF
);
if
(
!
hasExclusions
&&
flags
!=
0xFFFFFFFF
)
{
if
(
flags
==
0
)
{
// No interactions in this tile.
}
else
{
// Compute only a subset of the interactions in this tile.
unsigned
int
numTiles
=
interactionCount
[
0
];
int
pos
=
warp
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
int
end
=
(
warp
+
1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
#else
int
pos
=
warp
*
numTiles
/
totalWarps
;
int
end
=
(
warp
+
1
)
*
numTiles
/
totalWarps
;
#endif
int
skipBase
=
0
;
int
currentSkipIndex
=
tbx
;
__shared__
int
atomIndices
[
THREAD_BLOCK_SIZE
];
__shared__
int
skipTiles
[
THREAD_BLOCK_SIZE
];
skipTiles
[
threadIdx
.
x
]
=
-
1
;
while
(
pos
<
end
)
{
real
value
=
0
;
bool
includeTile
=
true
;
// Extract the coordinates of this tile.
unsigned
int
x
,
y
;
bool
singlePeriodicCopy
=
false
;
#ifdef USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles
[
pos
];
x
=
tileIndices
.
x
;
singlePeriodicCopy
=
tileIndices
.
y
;
}
else
#endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
if
((
flags
&
(
1
<<
j
))
!=
0
)
{
int
atom2
=
tbx
+
j
;
real4
posq2
=
localData
[
atom2
].
posq
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
real
tempValue1
=
0
;
real
tempValue2
=
0
;
if
(
r2
<
CUTOFF_SQUARED
)
{
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
LOAD_ATOM2_PARAMETERS
atom2
=
y
*
TILE_SIZE
+
j
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
COMPUTE_VALUE
}
value
+=
tempValue1
;
}
localData
[
threadIdx
.
x
].
temp
=
tempValue2
;
// Skip over tiles that have exclusions, since they were already processed.
// Sum the forces on atom2.
while
(
skipTiles
[
tbx
+
TILE_SIZE
-
1
]
<
pos
)
{
if
(
skipBase
+
tgx
<
NUM_TILES_WITH_EXCLUSIONS
)
{
ushort2
tile
=
exclusionTiles
[
skipBase
+
tgx
];
skipTiles
[
threadIdx
.
x
]
=
tile
.
x
+
tile
.
y
*
NUM_BLOCKS
-
tile
.
y
*
(
tile
.
y
+
1
)
/
2
;
}
else
skipTiles
[
threadIdx
.
x
]
=
end
;
skipBase
+=
TILE_SIZE
;
currentSkipIndex
=
tbx
;
}
while
(
skipTiles
[
currentSkipIndex
]
<
pos
)
currentSkipIndex
++
;
includeTile
=
(
skipTiles
[
currentSkipIndex
]
!=
pos
);
}
if
(
includeTile
)
{
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
// Load atom data for this tile.
real4
posq1
=
posq
[
atom1
];
LOAD_ATOM1_PARAMETERS
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
#ifdef USE_CUTOFF
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
]
:
y
*
TILE_SIZE
+
tgx
);
#else
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
atomIndices
[
threadIdx
.
x
]
=
j
;
if
(
j
<
PADDED_NUM_ATOMS
)
{
localData
[
localAtomIndex
].
posq
=
posq
[
j
];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData
[
localAtomIndex
].
value
=
0
;
}
#ifdef USE_PERIODIC
if
(
singlePeriodicCopy
)
{
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
if
(
tgx
%
4
==
0
)
localData
[
threadIdx
.
x
].
temp
+=
localData
[
threadIdx
.
x
+
1
].
temp
+
localData
[
threadIdx
.
x
+
2
].
temp
+
localData
[
threadIdx
.
x
+
3
].
temp
;
if
(
tgx
==
0
)
localData
[
tbx
+
j
].
value
+=
localData
[
threadIdx
.
x
].
temp
+
localData
[
threadIdx
.
x
+
4
].
temp
+
localData
[
threadIdx
.
x
+
8
].
temp
+
localData
[
threadIdx
.
x
+
12
].
temp
+
localData
[
threadIdx
.
x
+
16
].
temp
+
localData
[
threadIdx
.
x
+
20
].
temp
+
localData
[
threadIdx
.
x
+
24
].
temp
+
localData
[
threadIdx
.
x
+
28
].
temp
;
}
real4
blockCenterX
=
blockCenter
[
x
];
posq1
.
x
-=
floor
((
posq1
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
posq1
.
y
-=
floor
((
posq1
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
posq1
.
z
-=
floor
((
posq1
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
localData
[
threadIdx
.
x
].
posq
.
x
-=
floor
((
localData
[
threadIdx
.
x
].
posq
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
localData
[
threadIdx
.
x
].
posq
.
y
-=
floor
((
localData
[
threadIdx
.
x
].
posq
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
localData
[
threadIdx
.
x
].
posq
.
z
-=
floor
((
localData
[
threadIdx
.
x
].
posq
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
real4
posq2
=
localData
[
atom2
].
posq
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
if
(
r2
<
CUTOFF_SQUARED
)
{
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
LOAD_ATOM2_PARAMETERS
atom2
=
atomIndices
[
tbx
+
tj
];
real
tempValue1
=
0
;
real
tempValue2
=
0
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
COMPUTE_VALUE
}
value
+=
tempValue1
;
localData
[
tbx
+
tj
].
value
+=
tempValue2
;
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
else
}
else
#endif
{
// Compute the full set of interactions in this tile
.
{
// We need to apply periodic boundary conditions separately for each interaction
.
#ifdef USE_EXCLUSIONS
unsigned
int
excl
=
(
hasExclusions
?
exclusions
[
exclusionIndex
[
localGroupIndex
]
+
tgx
]
:
0xFFFFFFFF
);
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
TILE_SIZE
-
tgx
));
#endif
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
#ifdef USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
);
#endif
int
atom2
=
tbx
+
tj
;
real4
posq2
=
localData
[
atom2
].
posq
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
real4
posq2
=
localData
[
atom2
].
posq
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
if
(
r2
<
CUTOFF_SQUARED
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
LOAD_ATOM2_PARAMETERS
atom2
=
y
*
TILE_SIZE
+
tj
;
atom2
=
atomIndices
[
tbx
+
tj
]
;
real
tempValue1
=
0
;
real
tempValue2
=
0
;
#ifdef USE_EXCLUSIONS
if
(
!
isExcluded
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
#else
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
#endif
COMPUTE_VALUE
}
value
+=
tempValue1
;
localData
[
tbx
+
tj
].
value
+=
tempValue2
;
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl
>>=
1
;
#endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
#endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
}
// Write results.
if
(
pos
<
end
)
{
const
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
global_value
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
value
*
0x100000000
)));
}
if
(
pos
<
end
&&
x
!=
y
)
{
const
unsigned
int
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
global_value
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
value
*
0x100000000
)));
// Write results.
atomicAdd
(
&
global_value
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
value
*
0x100000000
)));
#ifdef USE_CUTOFF
unsigned
int
atom2
=
atomIndices
[
threadIdx
.
x
];
#else
unsigned
int
atom2
=
y
*
TILE_SIZE
+
tgx
;
#endif
if
(
atom2
<
PADDED_NUM_ATOMS
)
atomicAdd
(
&
global_value
[
atom2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
value
*
0x100000000
)));
}
lasty
=
y
;
pos
++
;
}
while
(
pos
<
end
);
}
}
platforms/cuda/src/kernels/customHbondForce.cu
View file @
93c467b2
...
...
@@ -48,12 +48,12 @@ inline __device__ real computeAngle(real4 vec1, real4 vec2) {
real3
crossProduct
=
cross
(
vec1
,
vec2
);
real
scale
=
vec1
.
w
*
vec2
.
w
;
angle
=
asin
(
SQRT
(
dot
(
crossProduct
,
crossProduct
)
/
scale
));
angle
=
ASIN
(
SQRT
(
dot
(
crossProduct
,
crossProduct
)
/
scale
));
if
(
cosine
<
0.0
f
)
angle
=
M_PI
-
angle
;
}
else
angle
=
acos
(
cosine
);
angle
=
ACOS
(
cosine
);
return
angle
;
}
...
...
platforms/cuda/src/kernels/ewald.cu
View file @
93c467b2
...
...
@@ -35,11 +35,11 @@ extern "C" __global__ void calculateEwaldCosSinSums(real* __restrict__ energyBuf
for
(
int
atom
=
0
;
atom
<
NUM_ATOMS
;
atom
++
)
{
real4
apos
=
posq
[
atom
];
real
phase
=
apos
.
x
*
kx
;
real2
structureFactor
=
make_real2
(
cos
(
phase
),
sin
(
phase
));
real2
structureFactor
=
make_real2
(
COS
(
phase
),
SIN
(
phase
));
phase
=
apos
.
y
*
ky
;
structureFactor
=
multofReal2
(
structureFactor
,
make_real2
(
cos
(
phase
),
sin
(
phase
)));
structureFactor
=
multofReal2
(
structureFactor
,
make_real2
(
COS
(
phase
),
SIN
(
phase
)));
phase
=
apos
.
z
*
kz
;
structureFactor
=
multofReal2
(
structureFactor
,
make_real2
(
cos
(
phase
),
sin
(
phase
)));
structureFactor
=
multofReal2
(
structureFactor
,
make_real2
(
COS
(
phase
),
SIN
(
phase
)));
sum
+=
apos
.
w
*
structureFactor
;
}
cosSinSum
[
index
]
=
sum
;
...
...
@@ -76,9 +76,9 @@ extern "C" __global__ void calculateEwaldForces(unsigned long long* __restrict__
for
(
int
ry
=
lowry
;
ry
<
KMAX_Y
;
ry
++
)
{
real
ky
=
ry
*
reciprocalBoxSize
.
y
;
real
phase
=
apos
.
x
*
kx
;
real2
tab_xy
=
make_real2
(
cos
(
phase
),
sin
(
phase
));
real2
tab_xy
=
make_real2
(
COS
(
phase
),
SIN
(
phase
));
phase
=
apos
.
y
*
ky
;
tab_xy
=
multofReal2
(
tab_xy
,
make_real2
(
cos
(
phase
),
sin
(
phase
)));
tab_xy
=
multofReal2
(
tab_xy
,
make_real2
(
COS
(
phase
),
SIN
(
phase
)));
for
(
int
rz
=
lowrz
;
rz
<
KMAX_Z
;
rz
++
)
{
real
kz
=
rz
*
reciprocalBoxSize
.
z
;
...
...
@@ -88,7 +88,7 @@ extern "C" __global__ void calculateEwaldForces(unsigned long long* __restrict__
real
k2
=
kx
*
kx
+
ky
*
ky
+
kz
*
kz
;
real
ak
=
EXP
(
k2
*
EXP_COEFFICIENT
)
/
k2
;
phase
=
apos
.
z
*
kz
;
real2
structureFactor
=
multofReal2
(
tab_xy
,
make_real2
(
cos
(
phase
),
sin
(
phase
)));
real2
structureFactor
=
multofReal2
(
tab_xy
,
make_real2
(
COS
(
phase
),
SIN
(
phase
)));
real2
sum
=
cosSinSum
[
index
];
real
dEdR
=
2
*
reciprocalCoefficient
*
ak
*
apos
.
w
*
(
sum
.
x
*
structureFactor
.
y
-
sum
.
y
*
structureFactor
.
x
);
force
.
x
+=
dEdR
*
kx
;
...
...
platforms/cuda/src/kernels/findInteractingBlocks.cu
View file @
93c467b2
#define TILE_SIZE 32
#define GROUP_SIZE 64
#define BUFFER_GROUPS 4
#define GROUP_SIZE 256
#define BUFFER_GROUPS 2
#define BUFFER_SIZE BUFFER_GROUPS*GROUP_SIZE
#define WARP_SIZE 32
#define INVALID 0xFFFF
/**
* Find a bounding box for the atoms in each block.
*/
extern
"C"
__global__
void
findBlockBounds
(
int
numAtoms
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
const
real4
*
__restrict__
posq
,
real4
*
__restrict__
blockCenter
,
real4
*
__restrict__
blockBoundingBox
,
unsigned
int
*
__restrict__
interactionCount
)
{
extern
"C"
__global__
void
findBlockBounds
(
int
numAtoms
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
const
real4
*
__restrict__
posq
,
real4
*
__restrict__
blockCenter
,
real4
*
__restrict__
blockBoundingBox
,
int
*
__restrict__
rebuildNeighborList
,
real2
*
__restrict__
sortedBlocks
)
{
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
base
=
index
*
TILE_SIZE
;
while
(
base
<
numAtoms
)
{
...
...
@@ -30,68 +32,231 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize,
minPos
=
make_real4
(
min
(
minPos
.
x
,
pos
.
x
),
min
(
minPos
.
y
,
pos
.
y
),
min
(
minPos
.
z
,
pos
.
z
),
0
);
maxPos
=
make_real4
(
max
(
maxPos
.
x
,
pos
.
x
),
max
(
maxPos
.
y
,
pos
.
y
),
max
(
maxPos
.
z
,
pos
.
z
),
0
);
}
blockBoundingBox
[
index
]
=
0.5
f
*
(
maxPos
-
minPos
);
real4
blockSize
=
0.5
f
*
(
maxPos
-
minPos
);
blockBoundingBox
[
index
]
=
blockSize
;
blockCenter
[
index
]
=
0.5
f
*
(
maxPos
+
minPos
);
sortedBlocks
[
index
]
=
make_real2
(
blockSize
.
x
+
blockSize
.
y
+
blockSize
.
z
,
index
);
index
+=
blockDim
.
x
*
gridDim
.
x
;
base
=
index
*
TILE_SIZE
;
}
if
(
blockIdx
.
x
==
0
&&
threadIdx
.
x
==
0
)
interactionCoun
t
[
0
]
=
0
;
rebuildNeighborLis
t
[
0
]
=
0
;
}
/**
* This is called by findBlocksWithInteractions(). It compacts the list of blocks and writes them
* to global memory.
* Sort the data about bounding boxes so it can be accessed more efficiently in the next kernel.
*/
__device__
void
storeInteractionData
(
ushort2
*
buffer
,
int
*
valid
,
short
*
sum
,
ushort2
*
temp
,
int
*
baseIndex
,
unsigned
int
*
interactionCount
,
ushort2
*
interactingTiles
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
const
real4
*
posq
,
const
real4
*
blockCenter
,
const
real4
*
blockBoundingBox
,
unsigned
int
maxTiles
)
{
// The buffer is full, so we need to compact it and write out results. Start by doing a parallel prefix sum.
extern
"C"
__global__
void
sortBoxData
(
const
real2
*
__restrict__
sortedBlock
,
const
real4
*
__restrict__
blockCenter
,
const
real4
*
__restrict__
blockBoundingBox
,
real4
*
__restrict__
sortedBlockCenter
,
real4
*
__restrict__
sortedBlockBoundingBox
,
const
real4
*
__restrict__
posq
,
const
real4
*
__restrict__
oldPositions
,
unsigned
int
*
__restrict__
interactionCount
,
int
*
__restrict__
rebuildNeighborList
)
{
for
(
int
i
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
i
<
NUM_BLOCKS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
index
=
(
int
)
sortedBlock
[
i
].
y
;
sortedBlockCenter
[
i
]
=
blockCenter
[
index
];
sortedBlockBoundingBox
[
i
]
=
blockBoundingBox
[
index
];
}
// Also check whether any atom has moved enough so that we really need to rebuild the neighbor list.
bool
rebuild
=
false
;
for
(
int
i
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
real4
delta
=
oldPositions
[
i
]
-
posq
[
i
];
if
(
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
>
0.25
f
*
PADDING
*
PADDING
)
rebuild
=
true
;
}
if
(
rebuild
)
{
rebuildNeighborList
[
0
]
=
1
;
interactionCount
[
0
]
=
0
;
}
}
for
(
int
i
=
threadIdx
.
x
;
i
<
BUFFER_SIZE
;
i
+=
GROUP_SIZE
)
temp
[
i
].
x
=
(
valid
[
i
]
?
1
:
0
);
/**
* Perform a parallel prefix sum over an array. The input values are all assumed to be 0 or 1.
*/
__device__
void
prefixSum
(
short
*
sum
,
ushort2
*
temp
)
{
#if __CUDA_ARCH__ >= 300
const
int
indexInWarp
=
threadIdx
.
x
%
WARP_SIZE
;
const
int
warpMask
=
(
2
<<
indexInWarp
)
-
1
;
for
(
int
base
=
0
;
base
<
BUFFER_SIZE
;
base
+=
blockDim
.
x
)
temp
[
base
+
threadIdx
.
x
].
x
=
__popc
(
__ballot
(
sum
[
base
+
threadIdx
.
x
])
&
warpMask
);
__syncthreads
();
if
(
threadIdx
.
x
<
BUFFER_SIZE
/
WARP_SIZE
)
{
int
multiWarpSum
=
temp
[(
threadIdx
.
x
+
1
)
*
WARP_SIZE
-
1
].
x
;
for
(
int
offset
=
1
;
offset
<
BUFFER_SIZE
/
WARP_SIZE
;
offset
*=
2
)
{
short
n
=
__shfl_up
(
multiWarpSum
,
offset
,
WARP_SIZE
);
if
(
indexInWarp
>=
offset
)
multiWarpSum
+=
n
;
}
temp
[
threadIdx
.
x
].
y
=
multiWarpSum
;
}
__syncthreads
();
for
(
int
i
=
threadIdx
.
x
;
i
<
BUFFER_SIZE
;
i
+=
blockDim
.
x
)
sum
[
i
]
=
temp
[
i
].
x
+
(
i
<
WARP_SIZE
?
0
:
temp
[
i
/
WARP_SIZE
-
1
].
y
);
__syncthreads
();
#else
for
(
int
i
=
threadIdx
.
x
;
i
<
BUFFER_SIZE
;
i
+=
blockDim
.
x
)
temp
[
i
].
x
=
sum
[
i
];
__syncthreads
();
int
whichBuffer
=
0
;
for
(
int
offset
=
1
;
offset
<
BUFFER_SIZE
;
offset
*=
2
)
{
if
(
whichBuffer
==
0
)
for
(
int
i
=
threadIdx
.
x
;
i
<
BUFFER_SIZE
;
i
+=
GROUP_SIZE
)
for
(
int
i
=
threadIdx
.
x
;
i
<
BUFFER_SIZE
;
i
+=
blockDim
.
x
)
temp
[
i
].
y
=
(
i
<
offset
?
temp
[
i
].
x
:
temp
[
i
].
x
+
temp
[
i
-
offset
].
x
);
else
for
(
int
i
=
threadIdx
.
x
;
i
<
BUFFER_SIZE
;
i
+=
GROUP_SIZE
)
for
(
int
i
=
threadIdx
.
x
;
i
<
BUFFER_SIZE
;
i
+=
blockDim
.
x
)
temp
[
i
].
x
=
(
i
<
offset
?
temp
[
i
].
y
:
temp
[
i
].
y
+
temp
[
i
-
offset
].
y
);
whichBuffer
=
1
-
whichBuffer
;
__syncthreads
();
}
if
(
whichBuffer
==
0
)
for
(
int
i
=
threadIdx
.
x
;
i
<
BUFFER_SIZE
;
i
+=
GROUP_SIZE
)
for
(
int
i
=
threadIdx
.
x
;
i
<
BUFFER_SIZE
;
i
+=
blockDim
.
x
)
sum
[
i
]
=
temp
[
i
].
x
;
else
for
(
int
i
=
threadIdx
.
x
;
i
<
BUFFER_SIZE
;
i
+=
GROUP_SIZE
)
for
(
int
i
=
threadIdx
.
x
;
i
<
BUFFER_SIZE
;
i
+=
blockDim
.
x
)
sum
[
i
]
=
temp
[
i
].
y
;
__syncthreads
();
int
numValid
=
sum
[
BUFFER_SIZE
-
1
];
__syncthreads
();
// Compact the buffer.
#endif
}
for
(
int
i
=
threadIdx
.
x
;
i
<
BUFFER_SIZE
;
i
+=
GROUP_SIZE
)
if
(
valid
[
i
])
{
temp
[
sum
[
i
]
-
1
]
=
buffer
[
i
];
sum
[
i
]
=
valid
[
i
];
valid
[
i
]
=
false
;
buffer
[
i
]
=
make_ushort2
(
1
,
1
);
/**
* This is called by findBlocksWithInteractions(). It compacts the list of blocks, identifies interactions
* in them, and writes the result to global memory.
*/
__device__
void
storeInteractionData
(
unsigned
short
x
,
unsigned
short
*
buffer
,
short
*
sum
,
ushort2
*
temp
,
int
*
atoms
,
int
&
numAtoms
,
int
&
baseIndex
,
unsigned
int
*
interactionCount
,
ushort2
*
interactingTiles
,
unsigned
int
*
interactingAtoms
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
const
real4
*
posq
,
real3
*
posBuffer
,
real4
blockCenterX
,
real4
blockSizeX
,
unsigned
int
maxTiles
,
bool
finish
)
{
const
bool
singlePeriodicCopy
=
(
0.5
f
*
periodicBoxSize
.
x
-
blockSizeX
.
x
>=
PADDED_CUTOFF
&&
0.5
f
*
periodicBoxSize
.
y
-
blockSizeX
.
y
>=
PADDED_CUTOFF
&&
0.5
f
*
periodicBoxSize
.
z
-
blockSizeX
.
z
>=
PADDED_CUTOFF
);
if
(
threadIdx
.
x
<
TILE_SIZE
)
{
real3
pos
=
trimTo3
(
posq
[
x
*
TILE_SIZE
+
threadIdx
.
x
]);
posBuffer
[
threadIdx
.
x
]
=
pos
;
#ifdef USE_PERIODIC
if
(
singlePeriodicCopy
)
{
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
pos
.
x
-=
floor
((
pos
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
pos
.
y
-=
floor
((
pos
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
pos
.
z
-=
floor
((
pos
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
posBuffer
[
threadIdx
.
x
]
=
pos
;
}
#endif
}
// The buffer is full, so we need to compact it and write out results. Start by doing a parallel prefix sum.
for
(
int
i
=
threadIdx
.
x
;
i
<
BUFFER_SIZE
;
i
+=
blockDim
.
x
)
sum
[
i
]
=
(
buffer
[
i
]
==
INVALID
?
0
:
1
);
__syncthreads
();
prefixSum
(
sum
,
temp
);
int
numValid
=
sum
[
BUFFER_SIZE
-
1
];
//
Store it to global memory
.
//
Compact the buffer
.
if
(
threadIdx
.
x
==
0
)
*
baseIndex
=
atomicAdd
(
interactionCount
,
numValid
);
for
(
int
i
=
threadIdx
.
x
;
i
<
BUFFER_SIZE
;
i
+=
blockDim
.
x
)
if
(
buffer
[
i
]
!=
INVALID
)
temp
[
sum
[
i
]
-
1
].
x
=
buffer
[
i
];
__syncthreads
();
if
(
*
baseIndex
+
numValid
<=
maxTiles
)
for
(
int
i
=
threadIdx
.
x
;
i
<
numValid
;
i
+=
GROUP_SIZE
)
interactingTiles
[
*
baseIndex
+
i
]
=
temp
[
i
];
for
(
int
i
=
threadIdx
.
x
;
i
<
BUFFER_SIZE
;
i
+=
blockDim
.
x
)
buffer
[
i
]
=
temp
[
i
].
x
;
__syncthreads
();
// Loop over the tiles and find specific interactions in them.
const
int
indexInWarp
=
threadIdx
.
x
%
WARP_SIZE
;
for
(
int
base
=
0
;
base
<
numValid
;
base
+=
BUFFER_SIZE
/
WARP_SIZE
)
{
for
(
int
i
=
threadIdx
.
x
/
WARP_SIZE
;
i
<
BUFFER_SIZE
/
WARP_SIZE
&&
base
+
i
<
numValid
;
i
+=
GROUP_SIZE
/
WARP_SIZE
)
{
// Check each atom in block Y for interactions.
real3
pos
=
trimTo3
(
posq
[
buffer
[
base
+
i
]
*
TILE_SIZE
+
indexInWarp
]);
#ifdef USE_PERIODIC
if
(
singlePeriodicCopy
)
{
pos
.
x
-=
floor
((
pos
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
pos
.
y
-=
floor
((
pos
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
pos
.
z
-=
floor
((
pos
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
}
#endif
bool
interacts
=
false
;
#ifdef USE_PERIODIC
if
(
!
singlePeriodicCopy
)
{
for
(
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
delta
=
pos
-
posBuffer
[
j
];
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
interacts
|=
(
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
<
PADDED_CUTOFF_SQUARED
);
}
}
else
{
#endif
for
(
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
delta
=
pos
-
posBuffer
[
j
];
interacts
|=
(
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
<
PADDED_CUTOFF_SQUARED
);
}
#ifdef USE_PERIODIC
}
#endif
sum
[
i
*
WARP_SIZE
+
indexInWarp
]
=
(
interacts
?
1
:
0
);
}
for
(
int
i
=
numValid
-
base
+
threadIdx
.
x
/
WARP_SIZE
;
i
<
BUFFER_SIZE
/
WARP_SIZE
;
i
+=
GROUP_SIZE
/
WARP_SIZE
)
sum
[
i
*
WARP_SIZE
+
indexInWarp
]
=
0
;
// Compact the list of atoms.
__syncthreads
();
prefixSum
(
sum
,
temp
);
for
(
int
i
=
threadIdx
.
x
;
i
<
BUFFER_SIZE
;
i
+=
blockDim
.
x
)
if
(
sum
[
i
]
!=
(
i
==
0
?
0
:
sum
[
i
-
1
]))
atoms
[
numAtoms
+
sum
[
i
]
-
1
]
=
buffer
[
base
+
i
/
WARP_SIZE
]
*
TILE_SIZE
+
indexInWarp
;
// Store them to global memory.
int
atomsToStore
=
numAtoms
+
sum
[
BUFFER_SIZE
-
1
];
bool
storePartialTile
=
(
finish
&&
base
>=
numValid
-
BUFFER_SIZE
/
WARP_SIZE
);
int
tilesToStore
=
(
storePartialTile
?
(
atomsToStore
+
TILE_SIZE
-
1
)
/
TILE_SIZE
:
atomsToStore
/
TILE_SIZE
);
if
(
tilesToStore
>
0
)
{
if
(
threadIdx
.
x
==
0
)
baseIndex
=
atomicAdd
(
interactionCount
,
tilesToStore
);
__syncthreads
();
if
(
threadIdx
.
x
==
0
)
numAtoms
=
atomsToStore
-
tilesToStore
*
TILE_SIZE
;
if
(
baseIndex
+
tilesToStore
<=
maxTiles
)
{
if
(
threadIdx
.
x
<
tilesToStore
)
interactingTiles
[
baseIndex
+
threadIdx
.
x
]
=
make_ushort2
(
x
,
singlePeriodicCopy
);
for
(
int
i
=
threadIdx
.
x
;
i
<
tilesToStore
*
TILE_SIZE
;
i
+=
blockDim
.
x
)
interactingAtoms
[
baseIndex
*
TILE_SIZE
+
i
]
=
(
i
<
atomsToStore
?
atoms
[
i
]
:
NUM_ATOMS
);
}
}
else
{
__syncthreads
();
if
(
threadIdx
.
x
==
0
)
numAtoms
+=
sum
[
BUFFER_SIZE
-
1
];
}
__syncthreads
();
if
(
threadIdx
.
x
<
numAtoms
&&
!
storePartialTile
)
atoms
[
threadIdx
.
x
]
=
atoms
[
tilesToStore
*
TILE_SIZE
+
threadIdx
.
x
];
}
if
(
numValid
==
0
&&
numAtoms
>
0
&&
finish
)
{
// We didn't have any more tiles to process, but there were some atoms left over from a
// previous call to this function. Save them now.
if
(
threadIdx
.
x
==
0
)
baseIndex
=
atomicAdd
(
interactionCount
,
1
);
__syncthreads
();
if
(
baseIndex
<
maxTiles
)
{
if
(
threadIdx
.
x
==
0
)
interactingTiles
[
baseIndex
]
=
make_ushort2
(
x
,
singlePeriodicCopy
);
if
(
threadIdx
.
x
<
TILE_SIZE
)
interactingAtoms
[
baseIndex
*
TILE_SIZE
+
threadIdx
.
x
]
=
(
threadIdx
.
x
<
numAtoms
?
atoms
[
threadIdx
.
x
]
:
NUM_ATOMS
);
}
}
// Reset the buffer for processing more tiles.
for
(
int
i
=
threadIdx
.
x
;
i
<
BUFFER_SIZE
;
i
+=
blockDim
.
x
)
buffer
[
i
]
=
INVALID
;
}
/**
...
...
@@ -100,139 +265,92 @@ __device__ void storeInteractionData(ushort2* buffer, int* valid, short* sum, us
*/
extern
"C"
__global__
void
findBlocksWithInteractions
(
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
const
real4
*
__restrict__
blockCenter
,
const
real4
*
__restrict__
blockBoundingBox
,
unsigned
int
*
__restrict__
interactionCount
,
ushort2
*
__restrict__
interactingTiles
,
unsigned
int
*
__restrict__
interactionFlags
,
const
real4
*
__restrict__
posq
,
unsigned
int
maxTiles
,
unsigned
int
startTileIndex
,
unsigned
int
numTiles
)
{
__shared__
ushort2
buffer
[
BUFFER_SIZE
];
__shared__
int
valid
[
BUFFER_SIZE
];
unsigned
int
*
__restrict__
interactingAtoms
,
const
real4
*
__restrict__
posq
,
unsigned
int
maxTiles
,
unsigned
int
startBlockIndex
,
unsigned
int
numBlocks
,
real2
*
__restrict__
sortedBlocks
,
const
real4
*
__restrict__
sortedBlockCenter
,
const
real4
*
__restrict__
sortedBlockBoundingBox
,
const
unsigned
int
*
__restrict__
exclusionIndices
,
const
unsigned
int
*
__restrict__
exclusionRowIndices
,
real4
*
__restrict__
oldPositions
,
const
int
*
__restrict__
rebuildNeighborList
)
{
__shared__
unsigned
short
buffer
[
BUFFER_SIZE
];
__shared__
short
sum
[
BUFFER_SIZE
];
__shared__
ushort2
temp
[
BUFFER_SIZE
];
__shared__
int
atoms
[
BUFFER_SIZE
+
TILE_SIZE
];
__shared__
real3
posBuffer
[
TILE_SIZE
];
__shared__
int
exclusionsForX
[
MAX_EXCLUSIONS
];
__shared__
int
bufferFull
;
__shared__
int
globalIndex
;
unsigned
int
endTileIndex
=
startTileIndex
+
numTiles
;
__shared__
int
numAtoms
;
if
(
rebuildNeighborList
[
0
]
==
0
)
return
;
// The neighbor list doesn't need to be rebuilt.
int
valuesInBuffer
=
0
;
if
(
threadIdx
.
x
==
0
)
bufferFull
=
false
;
for
(
int
i
=
0
;
i
<
BUFFER_GROUPS
;
++
i
)
valid
[
i
*
GROUP_SIZE
+
threadIdx
.
x
]
=
false
;
buffer
[
i
*
GROUP_SIZE
+
threadIdx
.
x
]
=
INVALID
;
__syncthreads
();
for
(
int
baseIndex
=
startTileIndex
+
blockIdx
.
x
*
blockDim
.
x
;
baseIndex
<
endTileIndex
;
baseIndex
+=
blockDim
.
x
*
gridDim
.
x
)
{
// Identify the pair of blocks to compare.
int
index
=
baseIndex
+
threadIdx
.
x
;
if
(
index
<
endTileIndex
)
{
unsigned
int
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
sqrt
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
index
));
unsigned
int
x
=
(
index
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
index
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}
// Find the distance between the bounding boxes of the two cells.
// Loop over blocks sorted by size.
for
(
int
i
=
startBlockIndex
+
blockIdx
.
x
;
i
<
startBlockIndex
+
numBlocks
;
i
+=
gridDim
.
x
)
{
if
(
threadIdx
.
x
==
blockDim
.
x
-
1
)
numAtoms
=
0
;
real2
sortedKey
=
sortedBlocks
[
i
];
unsigned
short
x
=
(
unsigned
short
)
sortedKey
.
y
;
real4
blockCenterX
=
blockCenter
[
x
];
real4
blockSizeX
=
blockBoundingBox
[
x
];
real4
delta
=
blockCenter
[
x
]
-
blockCenter
[
y
];
real4
boxSizea
=
blockBoundingBox
[
x
];
real4
boxSizeb
=
blockBoundingBox
[
y
];
// Load exclusion data for block x.
const
int
exclusionStart
=
exclusionRowIndices
[
x
];
const
int
exclusionEnd
=
exclusionRowIndices
[
x
+
1
];
const
int
numExclusions
=
exclusionEnd
-
exclusionStart
;
for
(
int
j
=
threadIdx
.
x
;
j
<
numExclusions
;
j
+=
blockDim
.
x
)
exclusionsForX
[
j
]
=
exclusionIndices
[
exclusionStart
+
j
];
__syncthreads
();
// Compare it to other blocks after this one in sorted order.
for
(
int
base
=
i
+
1
;
base
<
NUM_BLOCKS
;
base
+=
blockDim
.
x
)
{
int
j
=
base
+
threadIdx
.
x
;
real2
sortedKey2
=
(
j
<
NUM_BLOCKS
?
sortedBlocks
[
j
]
:
make_real2
(
0
));
real4
blockCenterY
=
(
j
<
NUM_BLOCKS
?
sortedBlockCenter
[
j
]
:
make_real4
(
0
));
real4
blockSizeY
=
(
j
<
NUM_BLOCKS
?
sortedBlockBoundingBox
[
j
]
:
make_real4
(
0
));
unsigned
short
y
=
(
unsigned
short
)
sortedKey2
.
y
;
real4
delta
=
blockCenterX
-
blockCenterY
;
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
delta
.
x
=
max
(
0.0
f
,
fabs
(
delta
.
x
)
-
boxSizea
.
x
-
boxSizeb
.
x
);
delta
.
y
=
max
(
0.0
f
,
fabs
(
delta
.
y
)
-
boxSizea
.
y
-
boxSizeb
.
y
);
delta
.
z
=
max
(
0.0
f
,
fabs
(
delta
.
z
)
-
boxSizea
.
z
-
boxSizeb
.
z
);
if
(
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
<
CUTOFF_SQUARED
)
{
delta
.
x
=
max
(
0.0
f
,
fabs
(
delta
.
x
)
-
blockSizeX
.
x
-
blockSizeY
.
x
);
delta
.
y
=
max
(
0.0
f
,
fabs
(
delta
.
y
)
-
blockSizeX
.
y
-
blockSizeY
.
y
);
delta
.
z
=
max
(
0.0
f
,
fabs
(
delta
.
z
)
-
blockSizeX
.
z
-
blockSizeY
.
z
);
bool
hasExclusions
=
false
;
for
(
int
k
=
0
;
k
<
numExclusions
;
k
++
)
hasExclusions
|=
(
exclusionsForX
[
k
]
==
y
);
if
(
j
<
NUM_BLOCKS
&&
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
<
PADDED_CUTOFF_SQUARED
&&
!
hasExclusions
)
{
// Add this tile to the buffer.
int
bufferIndex
=
valuesInBuffer
*
GROUP_SIZE
+
threadIdx
.
x
;
valid
[
bufferIndex
]
=
true
;
buffer
[
bufferIndex
]
=
make_ushort2
(
x
,
y
);
buffer
[
bufferIndex
]
=
y
;
valuesInBuffer
++
;
if
(
!
bufferFull
&&
valuesInBuffer
==
BUFFER_GROUPS
)
bufferFull
=
true
;
}
}
__syncthreads
();
if
(
bufferFull
)
{
storeInteractionData
(
buffer
,
valid
,
sum
,
temp
,
&
globalIndex
,
interactionCount
,
interactingTiles
,
periodicBoxSize
,
invPeriodicBoxSize
,
posq
,
blockCenter
,
blockBoundingBox
,
maxTiles
);
valuesInBuffer
=
0
;
if
(
threadIdx
.
x
==
0
)
bufferFull
=
false
;
__syncthreads
();
}
}
storeInteractionData
(
buffer
,
valid
,
sum
,
temp
,
&
globalIndex
,
interactionCount
,
interactingTiles
,
periodicBoxSize
,
invPeriodicBoxSize
,
posq
,
blockCenter
,
blockBoundingBox
,
maxTiles
);
}
/**
* Compare each atom in one block to the bounding box of another block, and set
* flags for which ones are interacting.
*/
extern
"C"
__global__
void
findInteractionsWithinBlocks
(
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
const
real4
*
__restrict__
posq
,
const
ushort2
*
__restrict__
tiles
,
const
real4
*
__restrict__
blockCenter
,
const
real4
*
__restrict__
blockBoundingBox
,
unsigned
int
*
__restrict__
interactionFlags
,
const
unsigned
int
*
__restrict__
interactionCount
,
unsigned
int
maxTiles
)
{
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
unsigned
int
numTiles
=
interactionCount
[
0
];
unsigned
int
pos
=
warp
*
numTiles
/
totalWarps
;
unsigned
int
end
=
(
warp
+
1
)
*
numTiles
/
totalWarps
;
unsigned
int
index
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
#if (__CUDA_ARCH__ < 200)
__shared__
unsigned
int
flags
[
128
];
#endif
if
(
numTiles
>
maxTiles
)
return
;
unsigned
int
lasty
=
0xFFFFFFFF
;
real4
apos
;
while
(
pos
<
end
)
{
// Extract the coordinates of this tile
ushort2
tileIndices
=
tiles
[
pos
];
unsigned
int
x
=
tileIndices
.
x
;
unsigned
int
y
=
tileIndices
.
y
;
if
(
x
==
y
)
{
if
(
index
==
0
)
interactionFlags
[
pos
]
=
0xFFFFFFFF
;
}
else
{
// Load the bounding box for x and the atom positions for y.
real4
center
=
blockCenter
[
x
];
real4
boxSize
=
blockBoundingBox
[
x
];
if
(
y
!=
lasty
)
apos
=
posq
[
y
*
TILE_SIZE
+
index
];
// Find the distance of the atom from the bounding box.
real4
delta
=
apos
-
center
;
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
delta
.
x
=
max
((
real
)
0
,
fabs
(
delta
.
x
)
-
boxSize
.
x
);
delta
.
y
=
max
((
real
)
0
,
fabs
(
delta
.
y
)
-
boxSize
.
y
);
delta
.
z
=
max
((
real
)
0
,
fabs
(
delta
.
z
)
-
boxSize
.
z
);
#if (__CUDA_ARCH__ < 200)
flags
[
threadIdx
.
x
]
=
(
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
>
CUTOFF_SQUARED
?
0
:
1
<<
index
);
if
(
index
%
4
==
0
)
flags
[
threadIdx
.
x
]
+=
flags
[
threadIdx
.
x
+
1
]
+
flags
[
threadIdx
.
x
+
2
]
+
flags
[
threadIdx
.
x
+
3
];
unsigned
int
allFlags
=
0
;
if
(
index
==
0
)
allFlags
=
flags
[
threadIdx
.
x
]
+
flags
[
threadIdx
.
x
+
4
]
+
flags
[
threadIdx
.
x
+
8
]
+
flags
[
threadIdx
.
x
+
12
]
+
flags
[
threadIdx
.
x
+
16
]
+
flags
[
threadIdx
.
x
+
20
]
+
flags
[
threadIdx
.
x
+
24
]
+
flags
[
threadIdx
.
x
+
28
];
#else
unsigned
int
allFlags
=
__ballot
(
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
<
CUTOFF_SQUARED
);
#endif
// Sum the flags.
if
(
index
==
0
)
{
// Count how many flags are set, and based on that decide whether to compute all interactions
// or only a fraction of them.
int
bits
=
__popc
(
allFlags
);
interactionFlags
[
pos
]
=
(
bits
>
12
?
0xFFFFFFFF
:
allFlags
);
if
(
bufferFull
)
{
storeInteractionData
(
x
,
buffer
,
sum
,
temp
,
atoms
,
numAtoms
,
globalIndex
,
interactionCount
,
interactingTiles
,
interactingAtoms
,
periodicBoxSize
,
invPeriodicBoxSize
,
posq
,
posBuffer
,
blockCenterX
,
blockSizeX
,
maxTiles
,
false
);
valuesInBuffer
=
0
;
if
(
threadIdx
.
x
==
0
)
bufferFull
=
false
;
__syncthreads
();
}
lasty
=
y
;
}
pos
++
;
storeInteractionData
(
x
,
buffer
,
sum
,
temp
,
atoms
,
numAtoms
,
globalIndex
,
interactionCount
,
interactingTiles
,
interactingAtoms
,
periodicBoxSize
,
invPeriodicBoxSize
,
posq
,
posBuffer
,
blockCenterX
,
blockSizeX
,
maxTiles
,
true
)
;
}
// Record the positions the neighbor list is based on.
for
(
int
i
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
oldPositions
[
i
]
=
posq
[
i
];
}
platforms/cuda/src/kernels/gbsaObc1.cu
View file @
93c467b2
#define DIELECTRIC_OFFSET 0.009f
#define PROBE_RADIUS 0.14f
#define SURFACE_AREA_FACTOR -170.351730667551f //-6.0f*3.14159265358979323846f*0.0216f*1000.0f*0.4184f;
#define TILE_SIZE 32
#define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)
/**
...
...
@@ -70,262 +69,332 @@ typedef struct {
*/
extern
"C"
__global__
void
computeBornSum
(
unsigned
long
long
*
__restrict__
global_bornSum
,
const
real4
*
__restrict__
posq
,
const
float2
*
__restrict__
global_params
,
#ifdef USE_CUTOFF
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
unsigned
int
*
__restrict__
interacti
onFlag
s
,
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
const
unsigned
int
*
__restrict__
interacti
ngAtom
s
,
#else
unsigned
int
numTiles
,
#endif
unsigned
int
*
exclusionIndices
,
unsigned
int
*
exclusionRowIndices
)
{
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
const
ushort2
*
__restrict__
exclusionTiles
)
{
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
const
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
__shared__
AtomData1
localData
[
FORCE_WORK_GROUP_SIZE
];
// First loop: process tiles that contain exclusions.
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE
+
warp
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE
+
(
warp
+
1
)
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
for
(
int
pos
=
firstExclusionTile
;
pos
<
lastExclusionTile
;
pos
++
)
{
const
ushort2
tileIndices
=
exclusionTiles
[
pos
];
const
unsigned
int
x
=
tileIndices
.
x
;
const
unsigned
int
y
=
tileIndices
.
y
;
real
bornSum
=
0
;
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real4
posq1
=
posq
[
atom1
];
float2
params1
=
global_params
[
atom1
];
if
(
x
==
y
)
{
// This tile is on the diagonal.
localData
[
threadIdx
.
x
].
x
=
posq1
.
x
;
localData
[
threadIdx
.
x
].
y
=
posq1
.
y
;
localData
[
threadIdx
.
x
].
z
=
posq1
.
z
;
localData
[
threadIdx
.
x
].
q
=
posq1
.
w
;
localData
[
threadIdx
.
x
].
radius
=
params1
.
x
;
localData
[
threadIdx
.
x
].
scaledRadius
=
params1
.
y
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
delta
=
make_real3
(
localData
[
tbx
+
j
].
x
-
posq1
.
x
,
localData
[
tbx
+
j
].
y
-
posq1
.
y
,
localData
[
tbx
+
j
].
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
j
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
#else
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
j
<
NUM_ATOMS
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
float2
params2
=
make_float2
(
localData
[
tbx
+
j
].
radius
,
localData
[
tbx
+
j
].
scaledRadius
);
real
rScaledRadiusJ
=
r
+
params2
.
y
;
if
((
j
!=
tgx
)
&&
(
params1
.
x
<
rScaledRadiusJ
))
{
real
l_ij
=
RECIP
(
max
(
params1
.
x
,
fabs
(
r
-
params2
.
y
)));
real
u_ij
=
RECIP
(
rScaledRadiusJ
);
real
l_ij2
=
l_ij
*
l_ij
;
real
u_ij2
=
u_ij
*
u_ij
;
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
));
bornSum
+=
l_ij
-
u_ij
+
(
0.50
f
*
invR
*
ratio
)
+
0.25
f
*
(
r
*
(
u_ij2
-
l_ij2
)
+
(
params2
.
y
*
params2
.
y
*
invR
)
*
(
l_ij2
-
u_ij2
));
bornSum
+=
(
params1
.
x
<
params2
.
y
-
r
?
2.0
f
*
(
RECIP
(
params1
.
x
)
-
l_ij
)
:
0
);
}
}
}
}
else
{
// This is an off-diagonal tile.
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
real4
tempPosq
=
posq
[
j
];
localData
[
threadIdx
.
x
].
x
=
tempPosq
.
x
;
localData
[
threadIdx
.
x
].
y
=
tempPosq
.
y
;
localData
[
threadIdx
.
x
].
z
=
tempPosq
.
z
;
localData
[
threadIdx
.
x
].
q
=
tempPosq
.
w
;
float2
tempParams
=
global_params
[
j
];
localData
[
threadIdx
.
x
].
radius
=
tempParams
.
x
;
localData
[
threadIdx
.
x
].
scaledRadius
=
tempParams
.
y
;
localData
[
threadIdx
.
x
].
bornSum
=
0.0
f
;
// Compute the full set of interactions in this tile.
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
delta
=
make_real3
(
localData
[
tbx
+
tj
].
x
-
posq1
.
x
,
localData
[
tbx
+
tj
].
y
-
posq1
.
y
,
localData
[
tbx
+
tj
].
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
tj
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
#else
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
tj
<
NUM_ATOMS
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
float2
params2
=
make_float2
(
localData
[
tbx
+
tj
].
radius
,
localData
[
tbx
+
tj
].
scaledRadius
);
real
rScaledRadiusJ
=
r
+
params2
.
y
;
if
(
params1
.
x
<
rScaledRadiusJ
)
{
real
l_ij
=
RECIP
(
max
(
params1
.
x
,
fabs
(
r
-
params2
.
y
)));
real
u_ij
=
RECIP
(
rScaledRadiusJ
);
real
l_ij2
=
l_ij
*
l_ij
;
real
u_ij2
=
u_ij
*
u_ij
;
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
));
bornSum
+=
l_ij
-
u_ij
+
(
0.50
f
*
invR
*
ratio
)
+
0.25
f
*
(
r
*
(
u_ij2
-
l_ij2
)
+
(
params2
.
y
*
params2
.
y
*
invR
)
*
(
l_ij2
-
u_ij2
));
bornSum
+=
(
params1
.
x
<
params2
.
y
-
r
?
2.0
f
*
(
RECIP
(
params1
.
x
)
-
l_ij
)
:
0
);
}
real
rScaledRadiusI
=
r
+
params1
.
y
;
if
(
params2
.
x
<
rScaledRadiusI
)
{
real
l_ij
=
RECIP
(
max
(
params2
.
x
,
fabs
(
r
-
params1
.
y
)));
real
u_ij
=
RECIP
(
rScaledRadiusI
);
real
l_ij2
=
l_ij
*
l_ij
;
real
u_ij2
=
u_ij
*
u_ij
;
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
));
real
term
=
l_ij
-
u_ij
+
(
0.50
f
*
invR
*
ratio
)
+
0.25
f
*
(
r
*
(
u_ij2
-
l_ij2
)
+
(
params1
.
y
*
params1
.
y
*
invR
)
*
(
l_ij2
-
u_ij2
));
term
+=
(
params2
.
x
<
params1
.
y
-
r
?
2.0
f
*
(
RECIP
(
params2
.
x
)
-
l_ij
)
:
0
);
localData
[
tbx
+
tj
].
bornSum
+=
term
;
}
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
// Write results.
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
global_bornSum
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
bornSum
*
0x100000000
)));
if
(
x
!=
y
)
{
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
global_bornSum
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
bornSum
*
0x100000000
)));
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
unsigned
int
numTiles
=
interactionCount
[
0
];
unsigned
int
pos
=
warp
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
unsigned
int
end
=
(
warp
+
1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
int
pos
=
warp
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
int
end
=
(
warp
+
1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
#else
unsigned
int
pos
=
warp
*
numTiles
/
totalWarps
;
unsigned
int
end
=
(
warp
+
1
)
*
numTiles
/
totalWarps
;
int
pos
=
warp
*
numTiles
/
totalWarps
;
int
end
=
(
warp
+
1
)
*
numTiles
/
totalWarps
;
#endif
unsigned
int
lasty
=
0xFFFFFFFF
;
__shared__
AtomData1
localData
[
FORCE_WORK_GROUP_SIZE
];
__shared__
unsigned
int
exclusionRange
[
2
*
WARPS_PER_GROUP
];
__shared__
int
exclusionIndex
[
WARPS_PER_GROUP
];
#ifndef ENABLE_SHUFFLE
__shared__
real
tempBuffer
[
FORCE_WORK_GROUP_SIZE
];
#endif
do
{
// Extract the coordinates of this tile
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
const
unsigned
int
localGroupIndex
=
threadIdx
.
x
/
TILE_SIZE
;
unsigned
int
x
,
y
;
int
skipBase
=
0
;
int
currentSkipIndex
=
tbx
;
__shared__
int
atomIndices
[
FORCE_WORK_GROUP_SIZE
];
__shared__
int
skipTiles
[
FORCE_WORK_GROUP_SIZE
];
skipTiles
[
threadIdx
.
x
]
=
-
1
;
while
(
pos
<
end
)
{
real
bornSum
=
0
;
if
(
pos
<
end
)
{
bool
includeTile
=
true
;
// Extract the coordinates of this tile.
unsigned
int
x
,
y
;
bool
singlePeriodicCopy
=
false
;
#ifdef USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles
[
pos
];
x
=
tileIndices
.
x
;
y
=
tileIndices
.
y
;
}
else
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles
[
pos
];
x
=
tileIndices
.
x
;
singlePeriodicCop
y
=
tileIndices
.
y
;
}
else
#endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
sqrt
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}
// Skip over tiles that have exclusions, since they were already processed.
while
(
skipTiles
[
tbx
+
TILE_SIZE
-
1
]
<
pos
)
{
if
(
skipBase
+
tgx
<
NUM_TILES_WITH_EXCLUSIONS
)
{
ushort2
tile
=
exclusionTiles
[
skipBase
+
tgx
];
skipTiles
[
threadIdx
.
x
]
=
tile
.
x
+
tile
.
y
*
NUM_BLOCKS
-
tile
.
y
*
(
tile
.
y
+
1
)
/
2
;
}
else
skipTiles
[
threadIdx
.
x
]
=
end
;
skipBase
+=
TILE_SIZE
;
currentSkipIndex
=
tbx
;
}
while
(
skipTiles
[
currentSkipIndex
]
<
pos
)
currentSkipIndex
++
;
includeTile
=
(
skipTiles
[
currentSkipIndex
]
!=
pos
);
}
if
(
includeTile
)
{
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
// Load atom data for this tile.
real4
posq1
=
posq
[
atom1
];
float2
params1
=
global_params
[
atom1
];
if
(
pos
>=
end
)
;
// This warp is done.
else
if
(
x
==
y
)
{
// This tile is on the diagonal.
localData
[
threadIdx
.
x
].
x
=
posq1
.
x
;
localData
[
threadIdx
.
x
].
y
=
posq1
.
y
;
localData
[
threadIdx
.
x
].
z
=
posq1
.
z
;
localData
[
threadIdx
.
x
].
q
=
posq1
.
w
;
localData
[
threadIdx
.
x
].
radius
=
params1
.
x
;
localData
[
threadIdx
.
x
].
scaledRadius
=
params1
.
y
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
delta
=
make_real3
(
localData
[
tbx
+
j
].
x
-
posq1
.
x
,
localData
[
tbx
+
j
].
y
-
posq1
.
y
,
localData
[
tbx
+
j
].
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
j
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
]
:
y
*
TILE_SIZE
+
tgx
);
#else
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
j
<
NUM_ATOMS
)
{
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
atomIndices
[
threadIdx
.
x
]
=
j
;
if
(
j
<
PADDED_NUM_ATOMS
)
{
real4
tempPosq
=
posq
[
j
];
localData
[
threadIdx
.
x
].
x
=
tempPosq
.
x
;
localData
[
threadIdx
.
x
].
y
=
tempPosq
.
y
;
localData
[
threadIdx
.
x
].
z
=
tempPosq
.
z
;
localData
[
threadIdx
.
x
].
q
=
tempPosq
.
w
;
float2
tempParams
=
global_params
[
j
];
localData
[
threadIdx
.
x
].
radius
=
tempParams
.
x
;
localData
[
threadIdx
.
x
].
scaledRadius
=
tempParams
.
y
;
localData
[
threadIdx
.
x
].
bornSum
=
0.0
f
;
}
#ifdef USE_PERIODIC
if
(
singlePeriodicCopy
)
{
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
real4
blockCenterX
=
blockCenter
[
x
];
posq1
.
x
-=
floor
((
posq1
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
posq1
.
y
-=
floor
((
posq1
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
posq1
.
z
-=
floor
((
posq1
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
localData
[
threadIdx
.
x
].
x
-=
floor
((
localData
[
threadIdx
.
x
].
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
localData
[
threadIdx
.
x
].
y
-=
floor
((
localData
[
threadIdx
.
x
].
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
localData
[
threadIdx
.
x
].
z
-=
floor
((
localData
[
threadIdx
.
x
].
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
delta
=
make_real3
(
localData
[
tbx
+
tj
].
x
-
posq1
.
x
,
localData
[
tbx
+
tj
].
y
-
posq1
.
y
,
localData
[
tbx
+
tj
].
z
-
posq1
.
z
);
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
int
atom2
=
atomIndices
[
tbx
+
tj
];
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
float2
params2
=
make_float2
(
localData
[
tbx
+
j
].
radius
,
localData
[
tbx
+
j
].
scaledRadius
);
float2
params2
=
make_float2
(
localData
[
tbx
+
t
j
].
radius
,
localData
[
tbx
+
t
j
].
scaledRadius
);
real
rScaledRadiusJ
=
r
+
params2
.
y
;
if
((
j
!=
tgx
)
&&
(
params1
.
x
<
rScaledRadiusJ
)
)
{
if
(
params1
.
x
<
rScaledRadiusJ
)
{
real
l_ij
=
RECIP
(
max
(
params1
.
x
,
fabs
(
r
-
params2
.
y
)));
real
u_ij
=
RECIP
(
rScaledRadiusJ
);
real
l_ij2
=
l_ij
*
l_ij
;
real
u_ij2
=
u_ij
*
u_ij
;
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
));
bornSum
+=
l_ij
-
u_ij
+
(
0.50
f
*
invR
*
ratio
)
+
0.25
f
*
(
r
*
(
u_ij2
-
l_ij2
)
+
(
params2
.
y
*
params2
.
y
*
invR
)
*
(
l_ij2
-
u_ij2
));
if
(
params1
.
x
<
params2
.
y
-
r
)
bornSum
+=
2.0
f
*
(
RECIP
(
params1
.
x
)
-
l_ij
);
(
params2
.
y
*
params2
.
y
*
invR
)
*
(
l_ij2
-
u_ij2
));
bornSum
+=
(
params1
.
x
<
params2
.
y
-
r
?
2.0
f
*
(
RECIP
(
params1
.
x
)
-
l_ij
)
:
0
);
}
real
rScaledRadiusI
=
r
+
params1
.
y
;
if
(
params2
.
x
<
rScaledRadiusI
)
{
real
l_ij
=
RECIP
(
max
(
params2
.
x
,
fabs
(
r
-
params1
.
y
)));
real
u_ij
=
RECIP
(
rScaledRadiusI
);
real
l_ij2
=
l_ij
*
l_ij
;
real
u_ij2
=
u_ij
*
u_ij
;
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
));
real
term
=
l_ij
-
u_ij
+
(
0.50
f
*
invR
*
ratio
)
+
0.25
f
*
(
r
*
(
u_ij2
-
l_ij2
)
+
(
params1
.
y
*
params1
.
y
*
invR
)
*
(
l_ij2
-
u_ij2
));
term
+=
(
params2
.
x
<
params1
.
y
-
r
?
2.0
f
*
(
RECIP
(
params2
.
x
)
-
l_ij
)
:
0
);
localData
[
tbx
+
tj
].
bornSum
+=
term
;
}
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
else
{
// This is an off-diagonal tile.
if
(
lasty
!=
y
)
{
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
real4
tempPosq
=
posq
[
j
];
localData
[
threadIdx
.
x
].
x
=
tempPosq
.
x
;
localData
[
threadIdx
.
x
].
y
=
tempPosq
.
y
;
localData
[
threadIdx
.
x
].
z
=
tempPosq
.
z
;
localData
[
threadIdx
.
x
].
q
=
tempPosq
.
w
;
float2
tempParams
=
global_params
[
j
];
localData
[
threadIdx
.
x
].
radius
=
tempParams
.
x
;
localData
[
threadIdx
.
x
].
scaledRadius
=
tempParams
.
y
;
}
localData
[
threadIdx
.
x
].
bornSum
=
0.0
f
;
#ifdef USE_CUTOFF
unsigned
int
flags
=
(
numTiles
<=
maxTiles
?
interactionFlags
[
pos
]
:
0xFFFFFFFF
);
bool
computeSubset
=
false
;
if
(
flags
!=
0xFFFFFFFF
)
{
if
(
tgx
<
2
)
exclusionRange
[
2
*
localGroupIndex
+
tgx
]
=
exclusionRowIndices
[
x
+
tgx
];
if
(
tgx
==
0
)
exclusionIndex
[
localGroupIndex
]
=
-
1
;
for
(
unsigned
int
i
=
exclusionRange
[
2
*
localGroupIndex
]
+
tgx
;
i
<
exclusionRange
[
2
*
localGroupIndex
+
1
];
i
+=
TILE_SIZE
)
if
(
exclusionIndices
[
i
]
==
y
)
exclusionIndex
[
localGroupIndex
]
=
i
*
TILE_SIZE
;
computeSubset
=
(
exclusionIndex
[
localGroupIndex
]
==
-
1
);
}
if
(
computeSubset
)
{
if
(
flags
==
0
)
{
// No interactions in this tile.
}
else
{
// Compute only a subset of the interactions in this tile.
else
#endif
{
// We need to apply periodic boundary conditions separately for each interaction.
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
if
((
flags
&
(
1
<<
j
))
!=
0
)
{
real3
delta
=
make_real3
(
localData
[
tbx
+
j
].
x
-
posq1
.
x
,
localData
[
tbx
+
j
].
y
-
posq1
.
y
,
localData
[
tbx
+
j
].
z
-
posq1
.
z
);
unsigned
int
t
j
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
delta
=
make_real3
(
localData
[
tbx
+
t
j
].
x
-
posq1
.
x
,
localData
[
tbx
+
t
j
].
y
-
posq1
.
y
,
localData
[
tbx
+
t
j
].
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
real
sum
=
0
;
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
int
atom2
=
atomIndices
[
tbx
+
tj
]
;
#ifdef USE_CUTOFF
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
j
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
#else
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
j
<
NUM_ATOMS
)
{
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
float2
params2
=
make_float2
(
localData
[
tbx
+
j
].
radius
,
localData
[
tbx
+
j
].
scaledRadius
);
real
rScaledRadiusJ
=
r
+
params2
.
y
;
if
(
params1
.
x
<
rScaledRadiusJ
)
{
real
l_ij
=
RECIP
(
max
(
params1
.
x
,
fabs
(
r
-
params2
.
y
)));
real
u_ij
=
RECIP
(
rScaledRadiusJ
);
real
l_ij2
=
l_ij
*
l_ij
;
real
u_ij2
=
u_ij
*
u_ij
;
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
));
bornSum
+=
l_ij
-
u_ij
+
(
0.50
f
*
invR
*
ratio
)
+
0.25
f
*
(
r
*
(
u_ij2
-
l_ij2
)
+
(
params2
.
y
*
params2
.
y
*
invR
)
*
(
l_ij2
-
u_ij2
));
if
(
params1
.
x
<
params2
.
y
-
r
)
bornSum
+=
2.0
f
*
(
RECIP
(
params1
.
x
)
-
l_ij
);
}
real
rScaledRadiusI
=
r
+
params1
.
y
;
if
(
params2
.
x
<
rScaledRadiusI
)
{
real
l_ij
=
RECIP
(
max
(
params2
.
x
,
fabs
(
r
-
params1
.
y
)));
real
u_ij
=
RECIP
(
rScaledRadiusI
);
real
l_ij2
=
l_ij
*
l_ij
;
real
u_ij2
=
u_ij
*
u_ij
;
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
));
real
term
=
l_ij
-
u_ij
+
(
0.50
f
*
invR
*
ratio
)
+
0.25
f
*
(
r
*
(
u_ij2
-
l_ij2
)
+
(
params1
.
y
*
params1
.
y
*
invR
)
*
(
l_ij2
-
u_ij2
));
if
(
params2
.
x
<
params1
.
y
-
r
)
term
+=
2.0
f
*
(
RECIP
(
params2
.
x
)
-
l_ij
);
sum
=
term
;
}
}
// Sum the forces on atom j.
#ifdef ENABLE_SHUFFLE
for
(
int
i
=
16
;
i
>=
1
;
i
/=
2
)
sum
+=
__shfl_xor
(
sum
,
i
,
32
);
if
(
tgx
==
0
)
localData
[
tbx
+
j
].
bornSum
+=
sum
;
#else
tempBuffer
[
threadIdx
.
x
]
=
sum
;
if
(
tgx
%
4
==
0
)
tempBuffer
[
threadIdx
.
x
]
+=
tempBuffer
[
threadIdx
.
x
+
1
]
+
tempBuffer
[
threadIdx
.
x
+
2
]
+
tempBuffer
[
threadIdx
.
x
+
3
];
if
(
tgx
==
0
)
localData
[
tbx
+
j
].
bornSum
+=
tempBuffer
[
threadIdx
.
x
]
+
tempBuffer
[
threadIdx
.
x
+
4
]
+
tempBuffer
[
threadIdx
.
x
+
8
]
+
tempBuffer
[
threadIdx
.
x
+
12
]
+
tempBuffer
[
threadIdx
.
x
+
16
]
+
tempBuffer
[
threadIdx
.
x
+
20
]
+
tempBuffer
[
threadIdx
.
x
+
24
]
+
tempBuffer
[
threadIdx
.
x
+
28
];
#endif
}
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
float2
params2
=
make_float2
(
localData
[
tbx
+
tj
].
radius
,
localData
[
tbx
+
tj
].
scaledRadius
);
real
rScaledRadiusJ
=
r
+
params2
.
y
;
if
(
params1
.
x
<
rScaledRadiusJ
)
{
real
l_ij
=
RECIP
(
max
(
params1
.
x
,
fabs
(
r
-
params2
.
y
)));
real
u_ij
=
RECIP
(
rScaledRadiusJ
);
real
l_ij2
=
l_ij
*
l_ij
;
real
u_ij2
=
u_ij
*
u_ij
;
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
));
bornSum
+=
l_ij
-
u_ij
+
(
0.50
f
*
invR
*
ratio
)
+
0.25
f
*
(
r
*
(
u_ij2
-
l_ij2
)
+
(
params2
.
y
*
params2
.
y
*
invR
)
*
(
l_ij2
-
u_ij2
));
bornSum
+=
(
params1
.
x
<
params2
.
y
-
r
?
2.0
f
*
(
RECIP
(
params1
.
x
)
-
l_ij
)
:
0
);
}
real
rScaledRadiusI
=
r
+
params1
.
y
;
if
(
params2
.
x
<
rScaledRadiusI
)
{
real
l_ij
=
RECIP
(
max
(
params2
.
x
,
fabs
(
r
-
params1
.
y
)));
real
u_ij
=
RECIP
(
rScaledRadiusI
);
real
l_ij2
=
l_ij
*
l_ij
;
real
u_ij2
=
u_ij
*
u_ij
;
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
));
real
term
=
l_ij
-
u_ij
+
(
0.50
f
*
invR
*
ratio
)
+
0.25
f
*
(
r
*
(
u_ij2
-
l_ij2
)
+
(
params1
.
y
*
params1
.
y
*
invR
)
*
(
l_ij2
-
u_ij2
));
term
+=
(
params2
.
x
<
params1
.
y
-
r
?
2.0
f
*
(
RECIP
(
params2
.
x
)
-
l_ij
)
:
0
);
localData
[
tbx
+
tj
].
bornSum
+=
term
;
}
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
else
#endif
{
// Compute the full set of interactions in this tile.
}
// Write results.
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
delta
=
make_real3
(
localData
[
tbx
+
tj
].
x
-
posq1
.
x
,
localData
[
tbx
+
tj
].
y
-
posq1
.
y
,
localData
[
tbx
+
tj
].
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
atomicAdd
(
&
global_bornSum
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
bornSum
*
0x100000000
)));
#ifdef USE_CUTOFF
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
tj
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
unsigned
int
atom2
=
atomIndices
[
threadIdx
.
x
];
#else
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
tj
<
NUM_ATOMS
)
{
unsigned
int
atom2
=
y
*
TILE_SIZE
+
tgx
;
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
float2
params2
=
make_float2
(
localData
[
tbx
+
tj
].
radius
,
localData
[
tbx
+
tj
].
scaledRadius
);
real
rScaledRadiusJ
=
r
+
params2
.
y
;
if
(
params1
.
x
<
rScaledRadiusJ
)
{
real
l_ij
=
RECIP
(
max
(
params1
.
x
,
fabs
(
r
-
params2
.
y
)));
real
u_ij
=
RECIP
(
rScaledRadiusJ
);
real
l_ij2
=
l_ij
*
l_ij
;
real
u_ij2
=
u_ij
*
u_ij
;
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
));
bornSum
+=
l_ij
-
u_ij
+
(
0.50
f
*
invR
*
ratio
)
+
0.25
f
*
(
r
*
(
u_ij2
-
l_ij2
)
+
(
params2
.
y
*
params2
.
y
*
invR
)
*
(
l_ij2
-
u_ij2
));
if
(
params1
.
x
<
params2
.
y
-
r
)
bornSum
+=
2.0
f
*
(
RECIP
(
params1
.
x
)
-
l_ij
);
}
real
rScaledRadiusI
=
r
+
params1
.
y
;
if
(
params2
.
x
<
rScaledRadiusI
)
{
real
l_ij
=
RECIP
(
max
(
params2
.
x
,
fabs
(
r
-
params1
.
y
)));
real
u_ij
=
RECIP
(
rScaledRadiusI
);
real
l_ij2
=
l_ij
*
l_ij
;
real
u_ij2
=
u_ij
*
u_ij
;
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
));
real
term
=
l_ij
-
u_ij
+
(
0.50
f
*
invR
*
ratio
)
+
0.25
f
*
(
r
*
(
u_ij2
-
l_ij2
)
+
(
params1
.
y
*
params1
.
y
*
invR
)
*
(
l_ij2
-
u_ij2
));
if
(
params2
.
x
<
params1
.
y
-
r
)
term
+=
2.0
f
*
(
RECIP
(
params2
.
x
)
-
l_ij
);
localData
[
tbx
+
tj
].
bornSum
+=
term
;
}
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
}
if
(
atom2
<
PADDED_NUM_ATOMS
)
atomicAdd
(
&
global_bornSum
[
atom2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
bornSum
*
0x100000000
)));
}
// Write results.
if
(
pos
<
end
)
{
const
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
global_bornSum
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
bornSum
*
0x100000000
)));
}
if
(
pos
<
end
&&
x
!=
y
)
{
const
unsigned
int
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
global_bornSum
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
bornSum
*
0x100000000
)));
}
lasty
=
y
;
pos
++
;
}
while
(
pos
<
end
);
}
}
typedef
struct
{
...
...
@@ -342,77 +411,50 @@ typedef struct {
extern
"C"
__global__
void
computeGBSAForce1
(
unsigned
long
long
*
__restrict__
forceBuffers
,
unsigned
long
long
*
__restrict__
global_bornForce
,
real
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
global_bornRadii
,
#ifdef USE_CUTOFF
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
unsigned
int
*
__restrict__
interacti
onFlag
s
,
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
const
unsigned
int
*
__restrict__
interacti
ngAtom
s
,
#else
unsigned
int
numTiles
,
#endif
unsigned
int
*
exclusionIndices
,
unsigned
int
*
exclusionRowIndices
)
{
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
#ifdef USE_CUTOFF
unsigned
int
numTiles
=
interactionCount
[
0
];
unsigned
int
pos
=
warp
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
unsigned
int
end
=
(
warp
+
1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
#else
unsigned
int
pos
=
warp
*
numTiles
/
totalWarps
;
unsigned
int
end
=
(
warp
+
1
)
*
numTiles
/
totalWarps
;
#endif
const
ushort2
*
__restrict__
exclusionTiles
)
{
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
const
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
real
energy
=
0
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__shared__
AtomData2
localData
[
FORCE_WORK_GROUP_SIZE
];
__shared__
unsigned
int
exclusionRange
[
2
*
WARPS_PER_GROUP
];
__shared__
int
exclusionIndex
[
WARPS_PER_GROUP
];
#ifndef ENABLE_SHUFFLE
__shared__
real4
tempBuffer
[
FORCE_WORK_GROUP_SIZE
];
#endif
do
{
// Extract the coordinates of this tile
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
const
unsigned
int
localGroupIndex
=
threadIdx
.
x
/
TILE_SIZE
;
unsigned
int
x
,
y
;
// First loop: process tiles that contain exclusions.
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE
+
warp
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE
+
(
warp
+
1
)
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
for
(
int
pos
=
firstExclusionTile
;
pos
<
lastExclusionTile
;
pos
++
)
{
const
ushort2
tileIndices
=
exclusionTiles
[
pos
];
const
unsigned
int
x
=
tileIndices
.
x
;
const
unsigned
int
y
=
tileIndices
.
y
;
real4
force
=
make_real4
(
0
);
if
(
pos
<
end
)
{
#ifdef USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles
[
pos
];
x
=
tileIndices
.
x
;
y
=
tileIndices
.
y
;
}
else
#endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
sqrt
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}
}
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real4
posq1
=
posq
[
atom1
];
real
bornRadius1
=
global_bornRadii
[
atom1
];
if
(
x
==
y
)
{
// This tile is on the diagonal.
localData
[
threadIdx
.
x
].
x
=
posq1
.
x
;
localData
[
threadIdx
.
x
].
y
=
posq1
.
y
;
localData
[
threadIdx
.
x
].
z
=
posq1
.
z
;
localData
[
threadIdx
.
x
].
q
=
posq1
.
w
;
localData
[
threadIdx
.
x
].
bornRadius
=
bornRadius1
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
j
<
NUM_ATOMS
)
{
real4
posq2
=
make_real4
(
localData
[
tbx
+
j
].
x
,
localData
[
tbx
+
j
].
y
,
localData
[
tbx
+
j
].
z
,
localData
[
tbx
+
j
].
q
);
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real4
posq1
=
posq
[
atom1
];
real
bornRadius1
=
global_bornRadii
[
atom1
];
if
(
x
==
y
)
{
// This tile is on the diagonal.
localData
[
threadIdx
.
x
].
x
=
posq1
.
x
;
localData
[
threadIdx
.
x
].
y
=
posq1
.
y
;
localData
[
threadIdx
.
x
].
z
=
posq1
.
z
;
localData
[
threadIdx
.
x
].
q
=
posq1
.
w
;
localData
[
threadIdx
.
x
].
bornRadius
=
bornRadius1
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
j
<
NUM_ATOMS
)
{
real4
posq2
=
make_real4
(
localData
[
tbx
+
j
].
x
,
localData
[
tbx
+
j
].
y
,
localData
[
tbx
+
j
].
z
,
localData
[
tbx
+
j
].
q
);
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
if
(
r2
<
CUTOFF_SQUARED
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
...
...
@@ -433,143 +475,233 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
#ifdef USE_CUTOFF
}
#endif
}
#endif
}
}
else
{
// This is an off-diagonal tile.
if
(
lasty
!=
y
)
{
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
real4
tempPosq
=
posq
[
j
];
localData
[
threadIdx
.
x
].
x
=
tempPosq
.
x
;
localData
[
threadIdx
.
x
].
y
=
tempPosq
.
y
;
localData
[
threadIdx
.
x
].
z
=
tempPosq
.
z
;
localData
[
threadIdx
.
x
].
q
=
tempPosq
.
w
;
localData
[
threadIdx
.
x
].
bornRadius
=
global_bornRadii
[
j
];
}
localData
[
threadIdx
.
x
].
fx
=
0.0
f
;
localData
[
threadIdx
.
x
].
fy
=
0.0
f
;
localData
[
threadIdx
.
x
].
fz
=
0.0
f
;
localData
[
threadIdx
.
x
].
fw
=
0.0
f
;
#ifdef USE_CUTOFF
unsigned
int
flags
=
(
numTiles
<=
maxTiles
?
interactionFlags
[
pos
]
:
0xFFFFFFFF
);
bool
computeSubset
=
false
;
if
(
flags
!=
0xFFFFFFFF
)
{
if
(
tgx
<
2
)
exclusionRange
[
2
*
localGroupIndex
+
tgx
]
=
exclusionRowIndices
[
x
+
tgx
];
if
(
tgx
==
0
)
exclusionIndex
[
localGroupIndex
]
=
-
1
;
for
(
unsigned
int
i
=
exclusionRange
[
2
*
localGroupIndex
]
+
tgx
;
i
<
exclusionRange
[
2
*
localGroupIndex
+
1
];
i
+=
TILE_SIZE
)
if
(
exclusionIndices
[
i
]
==
y
)
exclusionIndex
[
localGroupIndex
]
=
i
*
TILE_SIZE
;
computeSubset
=
(
exclusionIndex
[
localGroupIndex
]
==
-
1
);
}
if
(
computeSubset
)
{
if
(
flags
==
0
)
{
// No interactions in this tile.
}
else
{
// Compute only a subset of the interactions in this tile.
}
else
{
// This is an off-diagonal tile.
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
if
((
flags
&
(
1
<<
j
))
!=
0
)
{
real4
posq2
=
make_real4
(
localData
[
tbx
+
j
].
x
,
localData
[
tbx
+
j
].
y
,
localData
[
tbx
+
j
].
z
,
localData
[
tbx
+
j
].
q
);
real4
delta
=
make_real4
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
,
0
);
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
real4
tempPosq
=
posq
[
j
];
localData
[
threadIdx
.
x
].
x
=
tempPosq
.
x
;
localData
[
threadIdx
.
x
].
y
=
tempPosq
.
y
;
localData
[
threadIdx
.
x
].
z
=
tempPosq
.
z
;
localData
[
threadIdx
.
x
].
q
=
tempPosq
.
w
;
localData
[
threadIdx
.
x
].
bornRadius
=
global_bornRadii
[
j
];
localData
[
threadIdx
.
x
].
fx
=
0.0
f
;
localData
[
threadIdx
.
x
].
fy
=
0.0
f
;
localData
[
threadIdx
.
x
].
fz
=
0.0
f
;
localData
[
threadIdx
.
x
].
fw
=
0.0
f
;
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
tj
<
NUM_ATOMS
)
{
real4
posq2
=
make_real4
(
localData
[
tbx
+
tj
].
x
,
localData
[
tbx
+
tj
].
y
,
localData
[
tbx
+
tj
].
z
,
localData
[
tbx
+
tj
].
q
);
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
real
bornRadius2
=
localData
[
tbx
+
tj
].
bornRadius
;
real
alpha2_ij
=
bornRadius1
*
bornRadius2
;
real
D_ij
=
r2
*
RECIP
(
4.0
f
*
alpha2_ij
);
real
expTerm
=
EXP
(
-
D_ij
);
real
denominator2
=
r2
+
alpha2_ij
*
expTerm
;
real
denominator
=
SQRT
(
denominator2
);
real
tempEnergy
=
(
PREFACTOR
*
posq1
.
w
*
posq2
.
w
)
*
RECIP
(
denominator
);
real
Gpol
=
tempEnergy
*
RECIP
(
denominator2
);
real
dGpol_dalpha2_ij
=
-
0.5
f
*
Gpol
*
expTerm
*
(
1.0
f
+
D_ij
);
real
dEdR
=
Gpol
*
(
1.0
f
-
0.25
f
*
expTerm
);
force
.
w
+=
dGpol_dalpha2_ij
*
bornRadius2
;
energy
+=
tempEnergy
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
localData
[
tbx
+
tj
].
fx
+=
delta
.
x
;
localData
[
tbx
+
tj
].
fy
+=
delta
.
y
;
localData
[
tbx
+
tj
].
fz
+=
delta
.
z
;
localData
[
tbx
+
tj
].
fw
+=
dGpol_dalpha2_ij
*
bornRadius1
;
#ifdef USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
}
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
real
bornRadius2
=
localData
[
tbx
+
j
].
bornRadius
;
real
alpha2_ij
=
bornRadius1
*
bornRadius2
;
real
D_ij
=
r2
*
RECIP
(
4.0
f
*
alpha2_ij
);
real
expTerm
=
EXP
(
-
D_ij
);
real
denominator2
=
r2
+
alpha2_ij
*
expTerm
;
real
denominator
=
SQRT
(
denominator2
);
real
tempEnergy
=
(
PREFACTOR
*
posq1
.
w
*
posq2
.
w
)
*
RECIP
(
denominator
);
real
Gpol
=
tempEnergy
*
RECIP
(
denominator2
);
real
dGpol_dalpha2_ij
=
-
0.5
f
*
Gpol
*
expTerm
*
(
1.0
f
+
D_ij
);
real
dEdR
=
Gpol
*
(
1.0
f
-
0.25
f
*
expTerm
);
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
// Write results.
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
z
*
0x100000000
)));
atomicAdd
(
&
global_bornForce
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
w
*
0x100000000
)));
if
(
x
!=
y
)
{
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fx
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fy
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fz
*
0x100000000
)));
atomicAdd
(
&
global_bornForce
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fw
*
0x100000000
)));
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
if
(
atom1
>=
NUM_ATOMS
||
y
*
TILE_SIZE
+
j
>=
NUM_ATOMS
||
r2
>
CUTOFF_SQUARED
)
{
unsigned
int
numTiles
=
interactionCount
[
0
];
int
pos
=
warp
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
int
end
=
(
warp
+
1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
#else
if
(
atom1
>=
NUM_ATOMS
||
y
*
TILE_SIZE
+
j
>=
NUM_ATOMS
)
{
int
pos
=
warp
*
numTiles
/
totalWarps
;
int
end
=
(
warp
+
1
)
*
numTiles
/
totalWarps
;
#endif
dEdR
=
0.0
f
;
dGpol_dalpha2_ij
=
0.0
f
;
tempEnergy
=
0.0
f
;
}
energy
+=
tempEnergy
;
force
.
w
+=
dGpol_dalpha2_ij
*
bornRadius2
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
delta
.
w
=
dGpol_dalpha2_ij
*
bornRadius1
;
int
skipBase
=
0
;
int
currentSkipIndex
=
tbx
;
__shared__
int
atomIndices
[
FORCE_WORK_GROUP_SIZE
];
__shared__
int
skipTiles
[
FORCE_WORK_GROUP_SIZE
];
skipTiles
[
threadIdx
.
x
]
=
-
1
;
while
(
pos
<
end
)
{
real4
force
=
make_real4
(
0
);
bool
includeTile
=
true
;
// Extract the coordinates of this tile.
unsigned
int
x
,
y
;
bool
singlePeriodicCopy
=
false
;
#ifdef USE_CUTOFF
}
else
delta
=
make_real4
(
0
);
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles
[
pos
];
x
=
tileIndices
.
x
;
singlePeriodicCopy
=
tileIndices
.
y
;
}
else
#endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}
// Skip over tiles that have exclusions, since they were already processed.
while
(
skipTiles
[
tbx
+
TILE_SIZE
-
1
]
<
pos
)
{
if
(
skipBase
+
tgx
<
NUM_TILES_WITH_EXCLUSIONS
)
{
ushort2
tile
=
exclusionTiles
[
skipBase
+
tgx
];
skipTiles
[
threadIdx
.
x
]
=
tile
.
x
+
tile
.
y
*
NUM_BLOCKS
-
tile
.
y
*
(
tile
.
y
+
1
)
/
2
;
}
else
skipTiles
[
threadIdx
.
x
]
=
end
;
skipBase
+=
TILE_SIZE
;
currentSkipIndex
=
tbx
;
}
while
(
skipTiles
[
currentSkipIndex
]
<
pos
)
currentSkipIndex
++
;
includeTile
=
(
skipTiles
[
currentSkipIndex
]
!=
pos
);
}
if
(
includeTile
)
{
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
// Sum the forces on atom j.
#ifdef ENABLE_SHUFFLE
for
(
int
i
=
16
;
i
>=
1
;
i
/=
2
)
{
delta
.
x
+=
__shfl_xor
(
delta
.
x
,
i
,
32
);
delta
.
y
+=
__shfl_xor
(
delta
.
y
,
i
,
32
);
delta
.
z
+=
__shfl_xor
(
delta
.
z
,
i
,
32
);
delta
.
w
+=
__shfl_xor
(
delta
.
w
,
i
,
32
);
}
if
(
tgx
==
0
)
{
localData
[
tbx
+
j
].
fx
+=
delta
.
x
;
localData
[
tbx
+
j
].
fy
+=
delta
.
y
;
localData
[
tbx
+
j
].
fz
+=
delta
.
z
;
localData
[
tbx
+
j
].
fw
+=
delta
.
w
;
}
// Load atom data for this tile.
real4
posq1
=
posq
[
atom1
];
real
bornRadius1
=
global_bornRadii
[
atom1
];
#ifdef USE_CUTOFF
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
]
:
y
*
TILE_SIZE
+
tgx
);
#else
tempBuffer
[
threadIdx
.
x
]
=
delta
;
if
(
tgx
%
4
==
0
)
tempBuffer
[
threadIdx
.
x
]
+=
tempBuffer
[
threadIdx
.
x
+
1
]
+
tempBuffer
[
threadIdx
.
x
+
2
]
+
tempBuffer
[
threadIdx
.
x
+
3
];
if
(
tgx
==
0
)
{
real4
sum
=
tempBuffer
[
threadIdx
.
x
]
+
tempBuffer
[
threadIdx
.
x
+
4
]
+
tempBuffer
[
threadIdx
.
x
+
8
]
+
tempBuffer
[
threadIdx
.
x
+
12
]
+
tempBuffer
[
threadIdx
.
x
+
16
]
+
tempBuffer
[
threadIdx
.
x
+
20
]
+
tempBuffer
[
threadIdx
.
x
+
24
]
+
tempBuffer
[
threadIdx
.
x
+
28
];
localData
[
tbx
+
j
].
fx
+=
sum
.
x
;
localData
[
tbx
+
j
].
fy
+=
sum
.
y
;
localData
[
tbx
+
j
].
fz
+=
sum
.
z
;
localData
[
tbx
+
j
].
fw
+=
sum
.
w
;
}
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
}
atomIndices
[
threadIdx
.
x
]
=
j
;
if
(
j
<
PADDED_NUM_ATOMS
)
{
real4
tempPosq
=
posq
[
j
];
localData
[
threadIdx
.
x
].
x
=
tempPosq
.
x
;
localData
[
threadIdx
.
x
].
y
=
tempPosq
.
y
;
localData
[
threadIdx
.
x
].
z
=
tempPosq
.
z
;
localData
[
threadIdx
.
x
].
q
=
tempPosq
.
w
;
localData
[
threadIdx
.
x
].
bornRadius
=
global_bornRadii
[
j
];
localData
[
threadIdx
.
x
].
fx
=
0.0
f
;
localData
[
threadIdx
.
x
].
fy
=
0.0
f
;
localData
[
threadIdx
.
x
].
fz
=
0.0
f
;
localData
[
threadIdx
.
x
].
fw
=
0.0
f
;
}
#ifdef USE_PERIODIC
if
(
singlePeriodicCopy
)
{
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
real4
blockCenterX
=
blockCenter
[
x
];
posq1
.
x
-=
floor
((
posq1
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
posq1
.
y
-=
floor
((
posq1
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
posq1
.
z
-=
floor
((
posq1
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
localData
[
threadIdx
.
x
].
x
-=
floor
((
localData
[
threadIdx
.
x
].
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
localData
[
threadIdx
.
x
].
y
-=
floor
((
localData
[
threadIdx
.
x
].
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
localData
[
threadIdx
.
x
].
z
-=
floor
((
localData
[
threadIdx
.
x
].
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
atomIndices
[
tbx
+
tj
];
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real4
posq2
=
make_real4
(
localData
[
tbx
+
tj
].
x
,
localData
[
tbx
+
tj
].
y
,
localData
[
tbx
+
tj
].
z
,
localData
[
tbx
+
tj
].
q
);
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
if
(
r2
<
CUTOFF_SQUARED
)
{
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
real
bornRadius2
=
localData
[
tbx
+
tj
].
bornRadius
;
real
alpha2_ij
=
bornRadius1
*
bornRadius2
;
real
D_ij
=
r2
*
RECIP
(
4.0
f
*
alpha2_ij
);
real
expTerm
=
EXP
(
-
D_ij
);
real
denominator2
=
r2
+
alpha2_ij
*
expTerm
;
real
denominator
=
SQRT
(
denominator2
);
real
tempEnergy
=
(
PREFACTOR
*
posq1
.
w
*
posq2
.
w
)
*
RECIP
(
denominator
);
real
Gpol
=
tempEnergy
*
RECIP
(
denominator2
);
real
dGpol_dalpha2_ij
=
-
0.5
f
*
Gpol
*
expTerm
*
(
1.0
f
+
D_ij
);
real
dEdR
=
Gpol
*
(
1.0
f
-
0.25
f
*
expTerm
);
force
.
w
+=
dGpol_dalpha2_ij
*
bornRadius2
;
energy
+=
tempEnergy
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
localData
[
tbx
+
tj
].
fx
+=
delta
.
x
;
localData
[
tbx
+
tj
].
fy
+=
delta
.
y
;
localData
[
tbx
+
tj
].
fz
+=
delta
.
z
;
localData
[
tbx
+
tj
].
fw
+=
dGpol_dalpha2_ij
*
bornRadius1
;
}
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
else
}
else
#endif
{
// Compute the full set of interactions in this tile.
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
tj
<
NUM_ATOMS
)
{
real4
posq2
=
make_real4
(
localData
[
tbx
+
tj
].
x
,
localData
[
tbx
+
tj
].
y
,
localData
[
tbx
+
tj
].
z
,
localData
[
tbx
+
tj
].
q
);
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
{
// We need to apply periodic boundary conditions separately for each interaction.
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
atomIndices
[
tbx
+
tj
];
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real4
posq2
=
make_real4
(
localData
[
tbx
+
tj
].
x
,
localData
[
tbx
+
tj
].
y
,
localData
[
tbx
+
tj
].
z
,
localData
[
tbx
+
tj
].
q
);
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
if
(
r2
<
CUTOFF_SQUARED
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
...
...
@@ -594,33 +726,32 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
localData
[
tbx
+
tj
].
fz
+=
delta
.
z
;
localData
[
tbx
+
tj
].
fw
+=
dGpol_dalpha2_ij
*
bornRadius1
;
#ifdef USE_CUTOFF
}
#endif
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
#endif
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
// Write results.
atomicAdd
(
&
forceBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
z
*
0x100000000
)));
atomicAdd
(
&
global_bornForce
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
w
*
0x100000000
)));
#ifdef USE_CUTOFF
unsigned
int
atom2
=
atomIndices
[
threadIdx
.
x
];
#else
unsigned
int
atom2
=
y
*
TILE_SIZE
+
tgx
;
#endif
if
(
atom2
<
PADDED_NUM_ATOMS
)
{
atomicAdd
(
&
forceBuffers
[
atom2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fx
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fy
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fz
*
0x100000000
)));
atomicAdd
(
&
global_bornForce
[
atom2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fw
*
0x100000000
)));
}
}
// Write results.
if
(
pos
<
end
)
{
const
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
z
*
0x100000000
)));
atomicAdd
(
&
global_bornForce
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
w
*
0x100000000
)));
}
if
(
pos
<
end
&&
x
!=
y
)
{
const
unsigned
int
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fx
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fy
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fz
*
0x100000000
)));
atomicAdd
(
&
global_bornForce
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fw
*
0x100000000
)));
}
lasty
=
y
;
pos
++
;
}
while
(
pos
<
end
);
}
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
}
platforms/cuda/src/kernels/integrationUtilities.cu
View file @
93c467b2
...
...
@@ -24,14 +24,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
state
.
y
^=
state
.
y
<<
13
;
state
.
y
^=
state
.
y
>>
17
;
state
.
y
^=
state
.
y
<<
5
;
x1
=
sqrt
(
-
2.0
f
*
log
(
x1
));
x1
=
SQRT
(
-
2.0
f
*
LOG
(
x1
));
k
=
(
state
.
z
>>
2
)
+
(
state
.
w
>>
3
)
+
(
carry
>>
2
);
m
=
state
.
w
+
state
.
w
+
state
.
z
+
carry
;
state
.
z
=
state
.
w
;
state
.
w
=
m
;
carry
=
k
>>
30
;
float
x2
=
(
float
)(
state
.
x
+
state
.
y
+
state
.
w
)
/
(
float
)
0xffffffff
;
value
.
x
=
x1
*
cos
(
2.0
f
*
3.14159265
f
*
x2
);
value
.
x
=
x1
*
COS
(
2.0
f
*
3.14159265
f
*
x2
);
// Generate second value.
...
...
@@ -49,14 +49,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
state
.
y
^=
state
.
y
<<
13
;
state
.
y
^=
state
.
y
>>
17
;
state
.
y
^=
state
.
y
<<
5
;
x3
=
sqrt
(
-
2.0
f
*
log
(
x3
));
x3
=
SQRT
(
-
2.0
f
*
LOG
(
x3
));
k
=
(
state
.
z
>>
2
)
+
(
state
.
w
>>
3
)
+
(
carry
>>
2
);
m
=
state
.
w
+
state
.
w
+
state
.
z
+
carry
;
state
.
z
=
state
.
w
;
state
.
w
=
m
;
carry
=
k
>>
30
;
float
x4
=
(
float
)(
state
.
x
+
state
.
y
+
state
.
w
)
/
(
float
)
0xffffffff
;
value
.
y
=
x3
*
cos
(
2.0
f
*
3.14159265
f
*
x4
);
value
.
y
=
x3
*
COS
(
2.0
f
*
3.14159265
f
*
x4
);
// Generate third value.
...
...
@@ -74,14 +74,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
state
.
y
^=
state
.
y
<<
13
;
state
.
y
^=
state
.
y
>>
17
;
state
.
y
^=
state
.
y
<<
5
;
x5
=
sqrt
(
-
2.0
f
*
log
(
x5
));
x5
=
SQRT
(
-
2.0
f
*
LOG
(
x5
));
k
=
(
state
.
z
>>
2
)
+
(
state
.
w
>>
3
)
+
(
carry
>>
2
);
m
=
state
.
w
+
state
.
w
+
state
.
z
+
carry
;
state
.
z
=
state
.
w
;
state
.
w
=
m
;
carry
=
k
>>
30
;
float
x6
=
(
float
)(
state
.
x
+
state
.
y
+
state
.
w
)
/
(
float
)
0xffffffff
;
value
.
z
=
x5
*
cos
(
2.0
f
*
3.14159265
f
*
x6
);
value
.
z
=
x5
*
COS
(
2.0
f
*
3.14159265
f
*
x6
);
// Generate fourth value.
...
...
@@ -99,14 +99,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
state
.
y
^=
state
.
y
<<
13
;
state
.
y
^=
state
.
y
>>
17
;
state
.
y
^=
state
.
y
<<
5
;
x7
=
sqrt
(
-
2.0
f
*
log
(
x7
));
x7
=
SQRT
(
-
2.0
f
*
LOG
(
x7
));
k
=
(
state
.
z
>>
2
)
+
(
state
.
w
>>
3
)
+
(
carry
>>
2
);
m
=
state
.
w
+
state
.
w
+
state
.
z
+
carry
;
state
.
z
=
state
.
w
;
state
.
w
=
m
;
carry
=
k
>>
30
;
float
x8
=
(
float
)(
state
.
x
+
state
.
y
+
state
.
w
)
/
(
float
)
0xffffffff
;
value
.
w
=
x7
*
cos
(
2.0
f
*
3.14159265
f
*
x8
);
value
.
w
=
x7
*
COS
(
2.0
f
*
3.14159265
f
*
x8
);
// Record the values.
...
...
@@ -412,9 +412,9 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
mixed
yaksYd
=
zaksZd
*
xaksXd
-
xaksZd
*
zaksXd
;
mixed
zaksYd
=
xaksZd
*
yaksXd
-
yaksZd
*
xaksXd
;
mixed
axlng
=
sqrt
(
xaksXd
*
xaksXd
+
yaksXd
*
yaksXd
+
zaksXd
*
zaksXd
);
mixed
aylng
=
sqrt
(
xaksYd
*
xaksYd
+
yaksYd
*
yaksYd
+
zaksYd
*
zaksYd
);
mixed
azlng
=
sqrt
(
xaksZd
*
xaksZd
+
yaksZd
*
yaksZd
+
zaksZd
*
zaksZd
);
mixed
axlng
=
SQRT
(
xaksXd
*
xaksXd
+
yaksXd
*
yaksXd
+
zaksXd
*
zaksXd
);
mixed
aylng
=
SQRT
(
xaksYd
*
xaksYd
+
yaksYd
*
yaksYd
+
zaksYd
*
zaksYd
);
mixed
azlng
=
SQRT
(
xaksZd
*
xaksZd
+
yaksZd
*
yaksZd
+
zaksZd
*
zaksZd
);
mixed
trns11
=
xaksXd
/
axlng
;
mixed
trns21
=
yaksXd
/
axlng
;
mixed
trns31
=
zaksXd
/
axlng
;
...
...
@@ -440,13 +440,13 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
// --- Step2 A2' ---
float
rc
=
0.5
f
*
params
.
y
;
mixed
rb
=
sqrt
(
params
.
x
*
params
.
x
-
rc
*
rc
);
mixed
rb
=
SQRT
(
params
.
x
*
params
.
x
-
rc
*
rc
);
mixed
ra
=
rb
*
(
m1
+
m2
)
*
invTotalMass
;
rb
-=
ra
;
mixed
sinphi
=
za1d
/
ra
;
mixed
cosphi
=
sqrt
(
1
-
sinphi
*
sinphi
);
mixed
cosphi
=
SQRT
(
1
-
sinphi
*
sinphi
);
mixed
sinpsi
=
(
zb1d
-
zc1d
)
/
(
2
*
rc
*
cosphi
);
mixed
cospsi
=
sqrt
(
1
-
sinpsi
*
sinpsi
);
mixed
cospsi
=
SQRT
(
1
-
sinpsi
*
sinpsi
);
mixed
ya2d
=
ra
*
cosphi
;
mixed
xb2d
=
-
rc
*
cospsi
;
...
...
@@ -454,7 +454,7 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
mixed
yc2d
=
-
rb
*
cosphi
+
rc
*
sinpsi
*
sinphi
;
mixed
xb2d2
=
xb2d
*
xb2d
;
mixed
hh2
=
4.0
f
*
xb2d2
+
(
yb2d
-
yc2d
)
*
(
yb2d
-
yc2d
)
+
(
zb1d
-
zc1d
)
*
(
zb1d
-
zc1d
);
mixed
deltx
=
2.0
f
*
xb2d
+
sqrt
(
4.0
f
*
xb2d2
-
hh2
+
params
.
y
*
params
.
y
);
mixed
deltx
=
2.0
f
*
xb2d
+
SQRT
(
4.0
f
*
xb2d2
-
hh2
+
params
.
y
*
params
.
y
);
xb2d
-=
deltx
*
0.5
f
;
// --- Step3 al,be,ga ---
...
...
@@ -464,11 +464,11 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
mixed
gamma
=
xb0d
*
yb1d
-
xb1d
*
yb0d
+
xc0d
*
yc1d
-
xc1d
*
yc0d
;
mixed
al2be2
=
alpha
*
alpha
+
beta
*
beta
;
mixed
sintheta
=
(
alpha
*
gamma
-
beta
*
sqrt
(
al2be2
-
gamma
*
gamma
))
/
al2be2
;
mixed
sintheta
=
(
alpha
*
gamma
-
beta
*
SQRT
(
al2be2
-
gamma
*
gamma
))
/
al2be2
;
// --- Step4 A3' ---
mixed
costheta
=
sqrt
(
1
-
sintheta
*
sintheta
);
mixed
costheta
=
SQRT
(
1
-
sintheta
*
sintheta
);
mixed
xa3d
=
-
ya2d
*
sintheta
;
mixed
ya3d
=
ya2d
*
costheta
;
mixed
za3d
=
za1d
;
...
...
@@ -534,9 +534,9 @@ extern "C" __global__ void applySettleToVelocities(int numClusters, mixed tol, c
mixed3
eAB
=
make_mixed3
(
apos1
.
x
-
apos0
.
x
,
apos1
.
y
-
apos0
.
y
,
apos1
.
z
-
apos0
.
z
);
mixed3
eBC
=
make_mixed3
(
apos2
.
x
-
apos1
.
x
,
apos2
.
y
-
apos1
.
y
,
apos2
.
z
-
apos1
.
z
);
mixed3
eCA
=
make_mixed3
(
apos0
.
x
-
apos2
.
x
,
apos0
.
y
-
apos2
.
y
,
apos0
.
z
-
apos2
.
z
);
eAB
*=
rsqrt
(
eAB
.
x
*
eAB
.
x
+
eAB
.
y
*
eAB
.
y
+
eAB
.
z
*
eAB
.
z
);
eBC
*=
rsqrt
(
eBC
.
x
*
eBC
.
x
+
eBC
.
y
*
eBC
.
y
+
eBC
.
z
*
eBC
.
z
);
eCA
*=
rsqrt
(
eCA
.
x
*
eCA
.
x
+
eCA
.
y
*
eCA
.
y
+
eCA
.
z
*
eCA
.
z
);
eAB
*=
RSQRT
(
eAB
.
x
*
eAB
.
x
+
eAB
.
y
*
eAB
.
y
+
eAB
.
z
*
eAB
.
z
);
eBC
*=
RSQRT
(
eBC
.
x
*
eBC
.
x
+
eBC
.
y
*
eBC
.
y
+
eBC
.
z
*
eBC
.
z
);
eCA
*=
RSQRT
(
eCA
.
x
*
eCA
.
x
+
eCA
.
y
*
eCA
.
y
+
eCA
.
z
*
eCA
.
z
);
mixed
vAB
=
(
v1
.
x
-
v0
.
x
)
*
eAB
.
x
+
(
v1
.
y
-
v0
.
y
)
*
eAB
.
y
+
(
v1
.
z
-
v0
.
z
)
*
eAB
.
z
;
mixed
vBC
=
(
v2
.
x
-
v1
.
x
)
*
eBC
.
x
+
(
v2
.
y
-
v1
.
y
)
*
eBC
.
y
+
(
v2
.
z
-
v1
.
z
)
*
eBC
.
z
;
mixed
vCA
=
(
v0
.
x
-
v2
.
x
)
*
eCA
.
x
+
(
v0
.
y
-
v2
.
y
)
*
eCA
.
y
+
(
v0
.
z
-
v2
.
z
)
*
eCA
.
z
;
...
...
@@ -574,7 +574,8 @@ extern "C" __global__ void applySettleToVelocities(int numClusters, mixed tol, c
/**
* Compute the direction each CCMA constraint is pointing in. This is called once at the beginning of constraint evaluation.
*/
extern
"C"
__global__
void
computeCCMAConstraintDirections
(
const
int2
*
__restrict__
constraintAtoms
,
mixed4
*
__restrict__
constraintDistance
,
const
real4
*
__restrict__
atomPositions
,
const
real4
*
__restrict__
posqCorrection
)
{
extern
"C"
__global__
void
computeCCMAConstraintDirections
(
const
int2
*
__restrict__
constraintAtoms
,
mixed4
*
__restrict__
constraintDistance
,
const
real4
*
__restrict__
atomPositions
,
const
real4
*
__restrict__
posqCorrection
,
int
*
__restrict__
converged
)
{
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_CCMA_CONSTRAINTS
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
// Compute the direction for this constraint.
...
...
@@ -587,6 +588,10 @@ extern "C" __global__ void computeCCMAConstraintDirections(const int2* __restric
dir
.
z
=
oldPos1
.
z
-
oldPos2
.
z
;
constraintDistance
[
index
]
=
dir
;
}
if
(
threadIdx
.
x
==
0
&&
blockIdx
.
x
==
0
)
{
converged
[
0
]
=
1
;
converged
[
1
]
=
0
;
}
}
/**
...
...
@@ -605,6 +610,7 @@ extern "C" __global__ void computeCCMAPositionConstraintForce(const int2* __rest
__syncthreads
();
mixed
lowerTol
=
1
-
2
*
tol
+
tol
*
tol
;
mixed
upperTol
=
1
+
2
*
tol
+
tol
*
tol
;
bool
threadConverged
=
true
;
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_CCMA_CONSTRAINTS
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
// Compute the force due to this constraint.
...
...
@@ -620,14 +626,13 @@ extern "C" __global__ void computeCCMAPositionConstraintForce(const int2* __rest
mixed
dist2
=
dir
.
w
*
dir
.
w
;
mixed
diff
=
dist2
-
rp2
;
delta1
[
index
]
=
(
rrpr
>
d_ij2
*
1e-6
f
?
reducedMass
[
index
]
*
diff
/
rrpr
:
0.0
f
);
// See whether it has converged.
if
(
groupConverged
&&
(
rp2
<
lowerTol
*
dist2
||
rp2
>
upperTol
*
dist2
))
{
groupConverged
=
0
;
converged
[
iteration
%
2
]
=
0
;
}
threadConverged
&=
(
rp2
>
lowerTol
*
dist2
&&
rp2
<
upperTol
*
dist2
);
}
if
(
groupConverged
&&
!
threadConverged
)
groupConverged
=
0
;
__syncthreads
();
if
(
threadIdx
.
x
==
0
&&
!
groupConverged
)
converged
[
iteration
%
2
]
=
0
;
}
/**
...
...
Prev
1
2
3
4
5
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment