Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
93c467b2
Commit
93c467b2
authored
Mar 22, 2013
by
Peter Eastman
Browse files
Merged 5.1Optimizations branch back to trunk
parent
f6d4557d
Changes
86
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1724 additions
and
1166 deletions
+1724
-1166
platforms/cuda/src/CudaContext.cpp
platforms/cuda/src/CudaContext.cpp
+2
-1
platforms/cuda/src/CudaContext.h
platforms/cuda/src/CudaContext.h
+2
-0
platforms/cuda/src/CudaIntegrationUtilities.cpp
platforms/cuda/src/CudaIntegrationUtilities.cpp
+14
-16
platforms/cuda/src/CudaIntegrationUtilities.h
platforms/cuda/src/CudaIntegrationUtilities.h
+2
-3
platforms/cuda/src/CudaKernels.cpp
platforms/cuda/src/CudaKernels.cpp
+99
-71
platforms/cuda/src/CudaKernels.h
platforms/cuda/src/CudaKernels.h
+3
-7
platforms/cuda/src/CudaNonbondedUtilities.cpp
platforms/cuda/src/CudaNonbondedUtilities.cpp
+171
-100
platforms/cuda/src/CudaNonbondedUtilities.h
platforms/cuda/src/CudaNonbondedUtilities.h
+35
-23
platforms/cuda/src/CudaParallelKernels.cpp
platforms/cuda/src/CudaParallelKernels.cpp
+15
-17
platforms/cuda/src/CudaParallelKernels.h
platforms/cuda/src/CudaParallelKernels.h
+2
-2
platforms/cuda/src/CudaSort.cpp
platforms/cuda/src/CudaSort.cpp
+42
-32
platforms/cuda/src/CudaSort.h
platforms/cuda/src/CudaSort.h
+3
-2
platforms/cuda/src/kernels/coulombLennardJones.cu
platforms/cuda/src/kernels/coulombLennardJones.cu
+3
-3
platforms/cuda/src/kernels/customGBEnergyN2.cu
platforms/cuda/src/kernels/customGBEnergyN2.cu
+267
-134
platforms/cuda/src/kernels/customGBValueN2.cu
platforms/cuda/src/kernels/customGBValueN2.cu
+223
-168
platforms/cuda/src/kernels/customHbondForce.cu
platforms/cuda/src/kernels/customHbondForce.cu
+2
-2
platforms/cuda/src/kernels/ewald.cu
platforms/cuda/src/kernels/ewald.cu
+6
-6
platforms/cuda/src/kernels/findInteractingBlocks.cu
platforms/cuda/src/kernels/findInteractingBlocks.cu
+261
-143
platforms/cuda/src/kernels/gbsaObc1.cu
platforms/cuda/src/kernels/gbsaObc1.cu
+539
-408
platforms/cuda/src/kernels/integrationUtilities.cu
platforms/cuda/src/kernels/integrationUtilities.cu
+33
-28
No files found.
platforms/cuda/src/CudaContext.cpp
View file @
93c467b2
...
@@ -61,7 +61,7 @@ using namespace OpenMM;
...
@@ -61,7 +61,7 @@ using namespace OpenMM;
using
namespace
std
;
using
namespace
std
;
const
int
CudaContext
::
ThreadBlockSize
=
64
;
const
int
CudaContext
::
ThreadBlockSize
=
64
;
const
int
CudaContext
::
TileSize
=
32
;
const
int
CudaContext
::
TileSize
=
sizeof
(
tileflags
)
*
8
;
bool
CudaContext
::
hasInitializedCuda
=
false
;
bool
CudaContext
::
hasInitializedCuda
=
false
;
CudaContext
::
CudaContext
(
const
System
&
system
,
int
deviceIndex
,
bool
useBlockingSync
,
const
string
&
precision
,
const
string
&
compiler
,
CudaContext
::
CudaContext
(
const
System
&
system
,
int
deviceIndex
,
bool
useBlockingSync
,
const
string
&
precision
,
const
string
&
compiler
,
...
@@ -369,6 +369,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
...
@@ -369,6 +369,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
src
<<
"typedef float3 mixed3;
\n
"
;
src
<<
"typedef float3 mixed3;
\n
"
;
src
<<
"typedef float4 mixed4;
\n
"
;
src
<<
"typedef float4 mixed4;
\n
"
;
}
}
src
<<
"typedef unsigned int tileflags;
\n
"
;
for
(
map
<
string
,
string
>::
const_iterator
iter
=
defines
.
begin
();
iter
!=
defines
.
end
();
++
iter
)
{
for
(
map
<
string
,
string
>::
const_iterator
iter
=
defines
.
begin
();
iter
!=
defines
.
end
();
++
iter
)
{
src
<<
"#define "
<<
iter
->
first
;
src
<<
"#define "
<<
iter
->
first
;
if
(
!
iter
->
second
.
empty
())
if
(
!
iter
->
second
.
empty
())
...
...
platforms/cuda/src/CudaContext.h
View file @
93c467b2
...
@@ -42,6 +42,8 @@
...
@@ -42,6 +42,8 @@
#include "windowsExportCuda.h"
#include "windowsExportCuda.h"
#include "CudaPlatform.h"
#include "CudaPlatform.h"
typedef
unsigned
int
tileflags
;
namespace
OpenMM
{
namespace
OpenMM
{
class
CudaArray
;
class
CudaArray
;
...
...
platforms/cuda/src/CudaIntegrationUtilities.cpp
View file @
93c467b2
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* *
* Portions copyright (c) 2009-201
2
Stanford University and the Authors. *
* Portions copyright (c) 2009-201
3
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Authors: Peter Eastman *
* Contributors: *
* Contributors: *
* *
* *
...
@@ -99,7 +99,7 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
...
@@ -99,7 +99,7 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
posDelta
(
NULL
),
settleAtoms
(
NULL
),
settleParams
(
NULL
),
shakeAtoms
(
NULL
),
shakeParams
(
NULL
),
posDelta
(
NULL
),
settleAtoms
(
NULL
),
settleParams
(
NULL
),
shakeAtoms
(
NULL
),
shakeParams
(
NULL
),
random
(
NULL
),
randomSeed
(
NULL
),
randomPos
(
0
),
stepSize
(
NULL
),
ccmaAtoms
(
NULL
),
ccmaDistance
(
NULL
),
random
(
NULL
),
randomSeed
(
NULL
),
randomPos
(
0
),
stepSize
(
NULL
),
ccmaAtoms
(
NULL
),
ccmaDistance
(
NULL
),
ccmaReducedMass
(
NULL
),
ccmaAtomConstraints
(
NULL
),
ccmaNumAtomConstraints
(
NULL
),
ccmaConstraintMatrixColumn
(
NULL
),
ccmaReducedMass
(
NULL
),
ccmaAtomConstraints
(
NULL
),
ccmaNumAtomConstraints
(
NULL
),
ccmaConstraintMatrixColumn
(
NULL
),
ccmaConstraintMatrixValue
(
NULL
),
ccmaDelta1
(
NULL
),
ccmaDelta2
(
NULL
),
ccmaConverged
Memory
(
NULL
),
ccmaConstraintMatrixValue
(
NULL
),
ccmaDelta1
(
NULL
),
ccmaDelta2
(
NULL
),
ccmaConverged
(
NULL
),
vsite2AvgAtoms
(
NULL
),
vsite2AvgWeights
(
NULL
),
vsite3AvgAtoms
(
NULL
),
vsite3AvgWeights
(
NULL
),
vsite2AvgAtoms
(
NULL
),
vsite2AvgWeights
(
NULL
),
vsite3AvgAtoms
(
NULL
),
vsite3AvgWeights
(
NULL
),
vsiteOutOfPlaneAtoms
(
NULL
),
vsiteOutOfPlaneWeights
(
NULL
)
{
vsiteOutOfPlaneAtoms
(
NULL
),
vsiteOutOfPlaneWeights
(
NULL
)
{
// Create workspace arrays.
// Create workspace arrays.
...
@@ -466,9 +466,8 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
...
@@ -466,9 +466,8 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
ccmaAtoms
=
CudaArray
::
create
<
int2
>
(
context
,
numCCMA
,
"CcmaAtoms"
);
ccmaAtoms
=
CudaArray
::
create
<
int2
>
(
context
,
numCCMA
,
"CcmaAtoms"
);
ccmaAtomConstraints
=
CudaArray
::
create
<
int
>
(
context
,
numAtoms
*
maxAtomConstraints
,
"CcmaAtomConstraints"
);
ccmaAtomConstraints
=
CudaArray
::
create
<
int
>
(
context
,
numAtoms
*
maxAtomConstraints
,
"CcmaAtomConstraints"
);
ccmaNumAtomConstraints
=
CudaArray
::
create
<
int
>
(
context
,
numAtoms
,
"CcmaAtomConstraintsIndex"
);
ccmaNumAtomConstraints
=
CudaArray
::
create
<
int
>
(
context
,
numAtoms
,
"CcmaAtomConstraintsIndex"
);
CHECK_RESULT2
(
cuMemHostAlloc
((
void
**
)
&
ccmaConvergedMemory
,
2
*
sizeof
(
int
),
CU_MEMHOSTALLOC_DEVICEMAP
),
"Error allocating pinned memory"
);
CHECK_RESULT2
(
cuMemHostGetDevicePointer
(
&
ccmaConvergedDeviceMemory
,
ccmaConvergedMemory
,
0
),
"Error getting device address for pinned memory"
);
ccmaConstraintMatrixColumn
=
CudaArray
::
create
<
int
>
(
context
,
numCCMA
*
maxRowElements
,
"ConstraintMatrixColumn"
);
ccmaConstraintMatrixColumn
=
CudaArray
::
create
<
int
>
(
context
,
numCCMA
*
maxRowElements
,
"ConstraintMatrixColumn"
);
ccmaConverged
=
CudaArray
::
create
<
int
>
(
context
,
2
,
"ccmaConverged"
);
vector
<
int2
>
atomsVec
(
ccmaAtoms
->
getSize
());
vector
<
int2
>
atomsVec
(
ccmaAtoms
->
getSize
());
vector
<
int
>
atomConstraintsVec
(
ccmaAtomConstraints
->
getSize
());
vector
<
int
>
atomConstraintsVec
(
ccmaAtomConstraints
->
getSize
());
vector
<
int
>
numAtomConstraintsVec
(
ccmaNumAtomConstraints
->
getSize
());
vector
<
int
>
numAtomConstraintsVec
(
ccmaNumAtomConstraints
->
getSize
());
...
@@ -680,8 +679,8 @@ CudaIntegrationUtilities::~CudaIntegrationUtilities() {
...
@@ -680,8 +679,8 @@ CudaIntegrationUtilities::~CudaIntegrationUtilities() {
delete
ccmaDelta1
;
delete
ccmaDelta1
;
if
(
ccmaDelta2
!=
NULL
)
if
(
ccmaDelta2
!=
NULL
)
delete
ccmaDelta2
;
delete
ccmaDelta2
;
if
(
ccmaConverged
Memory
!=
NULL
)
if
(
ccmaConverged
!=
NULL
)
cuMemFreeHost
(
ccmaConverged
Memory
)
;
delete
ccmaConverged
;
if
(
vsite2AvgAtoms
!=
NULL
)
if
(
vsite2AvgAtoms
!=
NULL
)
delete
vsite2AvgAtoms
;
delete
vsite2AvgAtoms
;
if
(
vsite2AvgWeights
!=
NULL
)
if
(
vsite2AvgWeights
!=
NULL
)
...
@@ -734,33 +733,32 @@ void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double
...
@@ -734,33 +733,32 @@ void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double
context
.
executeKernel
(
shakeKernel
,
args
,
shakeAtoms
->
getSize
());
context
.
executeKernel
(
shakeKernel
,
args
,
shakeAtoms
->
getSize
());
}
}
if
(
ccmaAtoms
!=
NULL
)
{
if
(
ccmaAtoms
!=
NULL
)
{
void
*
directionsArgs
[]
=
{
&
ccmaAtoms
->
getDevicePointer
(),
&
ccmaDistance
->
getDevicePointer
(),
&
context
.
getPosq
().
getDevicePointer
(),
&
posCorrection
};
void
*
directionsArgs
[]
=
{
&
ccmaAtoms
->
getDevicePointer
(),
&
ccmaDistance
->
getDevicePointer
(),
&
context
.
getPosq
().
getDevicePointer
(),
&
posCorrection
,
&
ccmaConverged
->
getDevicePointer
()
};
context
.
executeKernel
(
ccmaDirectionsKernel
,
directionsArgs
,
ccmaAtoms
->
getSize
());
context
.
executeKernel
(
ccmaDirectionsKernel
,
directionsArgs
,
ccmaAtoms
->
getSize
());
int
i
;
int
i
;
void
*
forceArgs
[]
=
{
&
ccmaAtoms
->
getDevicePointer
(),
&
ccmaDistance
->
getDevicePointer
(),
void
*
forceArgs
[]
=
{
&
ccmaAtoms
->
getDevicePointer
(),
&
ccmaDistance
->
getDevicePointer
(),
constrainVelocities
?
&
context
.
getVelm
().
getDevicePointer
()
:
&
posDelta
->
getDevicePointer
(),
constrainVelocities
?
&
context
.
getVelm
().
getDevicePointer
()
:
&
posDelta
->
getDevicePointer
(),
&
ccmaReducedMass
->
getDevicePointer
(),
&
ccmaDelta1
->
getDevicePointer
(),
&
ccmaConvergedDevice
Memory
,
&
ccmaReducedMass
->
getDevicePointer
(),
&
ccmaDelta1
->
getDevicePointer
(),
&
ccmaConverged
->
get
Device
Pointer
()
,
tolPointer
,
&
i
};
tolPointer
,
&
i
};
void
*
multiplyArgs
[]
=
{
&
ccmaDelta1
->
getDevicePointer
(),
&
ccmaDelta2
->
getDevicePointer
(),
void
*
multiplyArgs
[]
=
{
&
ccmaDelta1
->
getDevicePointer
(),
&
ccmaDelta2
->
getDevicePointer
(),
&
ccmaConstraintMatrixColumn
->
getDevicePointer
(),
&
ccmaConstraintMatrixValue
->
getDevicePointer
(),
&
ccmaConvergedDevice
Memory
,
&
i
};
&
ccmaConstraintMatrixColumn
->
getDevicePointer
(),
&
ccmaConstraintMatrixValue
->
getDevicePointer
(),
&
ccmaConverged
->
get
Device
Pointer
()
,
&
i
};
void
*
updateArgs
[]
=
{
&
ccmaNumAtomConstraints
->
getDevicePointer
(),
&
ccmaAtomConstraints
->
getDevicePointer
(),
&
ccmaDistance
->
getDevicePointer
(),
void
*
updateArgs
[]
=
{
&
ccmaNumAtomConstraints
->
getDevicePointer
(),
&
ccmaAtomConstraints
->
getDevicePointer
(),
&
ccmaDistance
->
getDevicePointer
(),
constrainVelocities
?
&
context
.
getVelm
().
getDevicePointer
()
:
&
posDelta
->
getDevicePointer
(),
constrainVelocities
?
&
context
.
getVelm
().
getDevicePointer
()
:
&
posDelta
->
getDevicePointer
(),
&
context
.
getVelm
().
getDevicePointer
(),
&
ccmaDelta1
->
getDevicePointer
(),
&
ccmaDelta2
->
getDevicePointer
(),
&
context
.
getVelm
().
getDevicePointer
(),
&
ccmaDelta1
->
getDevicePointer
(),
&
ccmaDelta2
->
getDevicePointer
(),
&
ccmaConvergedDevice
Memory
,
&
i
};
&
ccmaConverged
->
get
Device
Pointer
()
,
&
i
};
const
int
checkInterval
=
4
;
const
int
checkInterval
=
4
;
int
*
converged
=
(
int
*
)
context
.
getPinnedBuffer
();
for
(
i
=
0
;
i
<
150
;
i
++
)
{
for
(
i
=
0
;
i
<
150
;
i
++
)
{
if
(
i
==
0
)
{
ccmaConvergedMemory
[
0
]
=
1
;
ccmaConvergedMemory
[
1
]
=
0
;
}
context
.
executeKernel
(
ccmaForceKernel
,
forceArgs
,
ccmaAtoms
->
getSize
());
context
.
executeKernel
(
ccmaForceKernel
,
forceArgs
,
ccmaAtoms
->
getSize
());
if
((
i
+
1
)
%
checkInterval
==
0
)
if
((
i
+
1
)
%
checkInterval
==
0
)
{
ccmaConverged
->
download
(
converged
,
false
);
CHECK_RESULT2
(
cuEventRecord
(
ccmaEvent
,
0
),
"Error recording event for CCMA"
);
CHECK_RESULT2
(
cuEventRecord
(
ccmaEvent
,
0
),
"Error recording event for CCMA"
);
}
context
.
executeKernel
(
ccmaMultiplyKernel
,
multiplyArgs
,
ccmaAtoms
->
getSize
());
context
.
executeKernel
(
ccmaMultiplyKernel
,
multiplyArgs
,
ccmaAtoms
->
getSize
());
context
.
executeKernel
(
ccmaUpdateKernel
,
updateArgs
,
context
.
getNumAtoms
());
context
.
executeKernel
(
ccmaUpdateKernel
,
updateArgs
,
context
.
getNumAtoms
());
if
((
i
+
1
)
%
checkInterval
==
0
)
{
if
((
i
+
1
)
%
checkInterval
==
0
)
{
CHECK_RESULT2
(
cuEventSynchronize
(
ccmaEvent
),
"Error synchronizing on event for CCMA"
);
CHECK_RESULT2
(
cuEventSynchronize
(
ccmaEvent
),
"Error synchronizing on event for CCMA"
);
if
(
c
cmaC
onverged
Memory
[
i
%
2
])
if
(
converged
[
i
%
2
])
break
;
break
;
}
}
}
}
...
...
platforms/cuda/src/CudaIntegrationUtilities.h
View file @
93c467b2
...
@@ -9,7 +9,7 @@
...
@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* *
* Portions copyright (c) 2009-201
2
Stanford University and the Authors. *
* Portions copyright (c) 2009-201
3
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Authors: Peter Eastman *
* Contributors: *
* Contributors: *
* *
* *
...
@@ -140,8 +140,7 @@ private:
...
@@ -140,8 +140,7 @@ private:
CudaArray
*
ccmaConstraintMatrixValue
;
CudaArray
*
ccmaConstraintMatrixValue
;
CudaArray
*
ccmaDelta1
;
CudaArray
*
ccmaDelta1
;
CudaArray
*
ccmaDelta2
;
CudaArray
*
ccmaDelta2
;
int
*
ccmaConvergedMemory
;
CudaArray
*
ccmaConverged
;
CUdeviceptr
ccmaConvergedDeviceMemory
;
CUevent
ccmaEvent
;
CUevent
ccmaEvent
;
CudaArray
*
vsite2AvgAtoms
;
CudaArray
*
vsite2AvgAtoms
;
CudaArray
*
vsite2AvgWeights
;
CudaArray
*
vsite2AvgWeights
;
...
...
platforms/cuda/src/CudaKernels.cpp
View file @
93c467b2
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* *
* Portions copyright (c) 2008-201
2
Stanford University and the Authors. *
* Portions copyright (c) 2008-201
3
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Authors: Peter Eastman *
* Contributors: *
* Contributors: *
* *
* *
...
@@ -1351,10 +1351,6 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
...
@@ -1351,10 +1351,6 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
delete
pmeBsplineModuliY
;
delete
pmeBsplineModuliY
;
if
(
pmeBsplineModuliZ
!=
NULL
)
if
(
pmeBsplineModuliZ
!=
NULL
)
delete
pmeBsplineModuliZ
;
delete
pmeBsplineModuliZ
;
if
(
pmeBsplineTheta
!=
NULL
)
delete
pmeBsplineTheta
;
if
(
pmeBsplineDTheta
!=
NULL
)
delete
pmeBsplineDTheta
;
if
(
pmeAtomRange
!=
NULL
)
if
(
pmeAtomRange
!=
NULL
)
delete
pmeAtomRange
;
delete
pmeAtomRange
;
if
(
pmeAtomGridIndex
!=
NULL
)
if
(
pmeAtomGridIndex
!=
NULL
)
...
@@ -1507,13 +1503,13 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
...
@@ -1507,13 +1503,13 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
if
(
cu
.
getUseDoublePrecision
())
if
(
cu
.
getUseDoublePrecision
())
pmeDefines
[
"USE_DOUBLE_PRECISION"
]
=
"1"
;
pmeDefines
[
"USE_DOUBLE_PRECISION"
]
=
"1"
;
CUmodule
module
=
cu
.
createModule
(
CudaKernelSources
::
vectorOps
+
CudaKernelSources
::
pme
,
pmeDefines
);
CUmodule
module
=
cu
.
createModule
(
CudaKernelSources
::
vectorOps
+
CudaKernelSources
::
pme
,
pmeDefines
);
pmeUpdateBsplinesKernel
=
cu
.
getKernel
(
module
,
"updateBsplines"
);
pmeGridIndexKernel
=
cu
.
getKernel
(
module
,
"findAtomGridIndex"
);
pmeAtomRangeKernel
=
cu
.
getKernel
(
module
,
"findAtomRangeForGrid"
);
pmeSpreadChargeKernel
=
cu
.
getKernel
(
module
,
"gridSpreadCharge"
);
pmeSpreadChargeKernel
=
cu
.
getKernel
(
module
,
"gridSpreadCharge"
);
pmeConvolutionKernel
=
cu
.
getKernel
(
module
,
"reciprocalConvolution"
);
pmeConvolutionKernel
=
cu
.
getKernel
(
module
,
"reciprocalConvolution"
);
pmeInterpolateForceKernel
=
cu
.
getKernel
(
module
,
"gridInterpolateForce"
);
pmeInterpolateForceKernel
=
cu
.
getKernel
(
module
,
"gridInterpolateForce"
);
pmeEvalEnergyKernel
=
cu
.
getKernel
(
module
,
"gridEvaluateEnergy"
);
pmeEvalEnergyKernel
=
cu
.
getKernel
(
module
,
"gridEvaluateEnergy"
);
pmeFinishSpreadChargeKernel
=
cu
.
getKernel
(
module
,
"finishSpreadCharge"
);
pmeFinishSpreadChargeKernel
=
cu
.
getKernel
(
module
,
"finishSpreadCharge"
);
cuFuncSetCacheConfig
(
pmeSpreadChargeKernel
,
CU_FUNC_CACHE_PREFER_L1
);
cuFuncSetCacheConfig
(
pmeInterpolateForceKernel
,
CU_FUNC_CACHE_PREFER_L1
);
cuFuncSetCacheConfig
(
pmeInterpolateForceKernel
,
CU_FUNC_CACHE_PREFER_L1
);
// Create required data structures.
// Create required data structures.
...
@@ -1528,7 +1524,6 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
...
@@ -1528,7 +1524,6 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
pmeBsplineModuliX
=
new
CudaArray
(
cu
,
gridSizeX
,
elementSize
,
"pmeBsplineModuliX"
);
pmeBsplineModuliX
=
new
CudaArray
(
cu
,
gridSizeX
,
elementSize
,
"pmeBsplineModuliX"
);
pmeBsplineModuliY
=
new
CudaArray
(
cu
,
gridSizeY
,
elementSize
,
"pmeBsplineModuliY"
);
pmeBsplineModuliY
=
new
CudaArray
(
cu
,
gridSizeY
,
elementSize
,
"pmeBsplineModuliY"
);
pmeBsplineModuliZ
=
new
CudaArray
(
cu
,
gridSizeZ
,
elementSize
,
"pmeBsplineModuliZ"
);
pmeBsplineModuliZ
=
new
CudaArray
(
cu
,
gridSizeZ
,
elementSize
,
"pmeBsplineModuliZ"
);
pmeBsplineTheta
=
new
CudaArray
(
cu
,
PmeOrder
*
numParticles
,
4
*
elementSize
,
"pmeBsplineTheta"
);
pmeAtomRange
=
CudaArray
::
create
<
int
>
(
cu
,
gridSizeX
*
gridSizeY
*
gridSizeZ
+
1
,
"pmeAtomRange"
);
pmeAtomRange
=
CudaArray
::
create
<
int
>
(
cu
,
gridSizeX
*
gridSizeY
*
gridSizeZ
+
1
,
"pmeAtomRange"
);
pmeAtomGridIndex
=
CudaArray
::
create
<
int2
>
(
cu
,
numParticles
,
"pmeAtomGridIndex"
);
pmeAtomGridIndex
=
CudaArray
::
create
<
int2
>
(
cu
,
numParticles
,
"pmeAtomGridIndex"
);
sort
=
new
CudaSort
(
cu
,
new
SortTrait
(),
cu
.
getNumAtoms
());
sort
=
new
CudaSort
(
cu
,
new
SortTrait
(),
cu
.
getNumAtoms
());
...
@@ -1659,20 +1654,14 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
...
@@ -1659,20 +1654,14 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
cu
.
executeKernel
(
ewaldForcesKernel
,
forcesArgs
,
cu
.
getNumAtoms
());
cu
.
executeKernel
(
ewaldForcesKernel
,
forcesArgs
,
cu
.
getNumAtoms
());
}
}
if
(
directPmeGrid
!=
NULL
&&
cu
.
getContextIndex
()
==
0
&&
includeReciprocal
)
{
if
(
directPmeGrid
!=
NULL
&&
cu
.
getContextIndex
()
==
0
&&
includeReciprocal
)
{
void
*
bsplinesArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
pmeBsplineTheta
->
getDevicePointer
(),
&
pmeAtomGridIndex
->
getDevicePointer
(),
void
*
gridIndexArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
pmeAtomGridIndex
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()};
cu
.
getPeriodicBoxSizePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()};
cu
.
executeKernel
(
pmeGridIndexKernel
,
gridIndexArgs
,
cu
.
getNumAtoms
());
int
bsplinesSharedSize
=
cu
.
ThreadBlockSize
*
PmeOrder
*
(
cu
.
getUseDoublePrecision
()
?
sizeof
(
double4
)
:
sizeof
(
float4
));
cu
.
executeKernel
(
pmeUpdateBsplinesKernel
,
bsplinesArgs
,
cu
.
getNumAtoms
(),
cu
.
ThreadBlockSize
,
bsplinesSharedSize
);
sort
->
sort
(
*
pmeAtomGridIndex
);
sort
->
sort
(
*
pmeAtomGridIndex
);
void
*
rangeArgs
[]
=
{
&
pmeAtomGridIndex
->
getDevicePointer
(),
&
pmeAtomRange
->
getDevicePointer
(),
&
cu
.
getPosq
().
getDevicePointer
(),
void
*
spreadArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
directPmeGrid
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()};
cu
.
getInvPeriodicBoxSizePointer
(),
&
pmeAtomGridIndex
->
getDevicePointer
()};
cu
.
executeKernel
(
pmeAtomRangeKernel
,
rangeArgs
,
cu
.
getNumAtoms
());
cu
.
executeKernel
(
pmeSpreadChargeKernel
,
spreadArgs
,
cu
.
getNumAtoms
(),
128
);
void
*
spreadArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
directPmeGrid
->
getDevicePointer
(),
&
pmeBsplineTheta
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()};
cu
.
executeKernel
(
pmeSpreadChargeKernel
,
spreadArgs
,
cu
.
getNumAtoms
(),
PmeOrder
*
PmeOrder
*
PmeOrder
);
void
*
finishSpreadArgs
[]
=
{
&
directPmeGrid
->
getDevicePointer
()};
if
(
cu
.
getUseDoublePrecision
()
||
cu
.
getComputeCapability
()
<
2.0
)
{
if
(
cu
.
getUseDoublePrecision
()
||
cu
.
getComputeCapability
()
<
2.0
)
{
void
*
finishSpreadArgs
[]
=
{
&
directPmeGrid
->
getDevicePointer
()};
void
*
finishSpreadArgs
[]
=
{
&
directPmeGrid
->
getDevicePointer
()};
...
@@ -1699,8 +1688,8 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
...
@@ -1699,8 +1688,8 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
void
*
interpolateArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
cu
.
getForce
().
getDevicePointer
(),
&
directPmeGrid
->
getDevicePointer
(),
void
*
interpolateArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
cu
.
getForce
().
getDevicePointer
(),
&
directPmeGrid
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()};
cu
.
getPeriodicBoxSizePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()
,
&
pmeAtomGridIndex
->
getDevicePointer
()
};
cu
.
executeKernel
(
pmeInterpolateForceKernel
,
interpolateArgs
,
cu
.
getNumAtoms
());
cu
.
executeKernel
(
pmeInterpolateForceKernel
,
interpolateArgs
,
cu
.
getNumAtoms
()
,
128
);
}
}
double
energy
=
(
includeReciprocal
?
ewaldSelfEnergy
:
0.0
);
double
energy
=
(
includeReciprocal
?
ewaldSelfEnergy
:
0.0
);
...
@@ -2071,6 +2060,14 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
...
@@ -2071,6 +2060,14 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
defines
[
"PADDED_NUM_ATOMS"
]
=
cu
.
intToString
(
cu
.
getPaddedNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
cu
.
intToString
(
cu
.
getPaddedNumAtoms
());
defines
[
"NUM_BLOCKS"
]
=
cu
.
intToString
(
cu
.
getNumAtomBlocks
());
defines
[
"NUM_BLOCKS"
]
=
cu
.
intToString
(
cu
.
getNumAtomBlocks
());
defines
[
"FORCE_WORK_GROUP_SIZE"
]
=
cu
.
intToString
(
nb
.
getForceThreadBlockSize
());
defines
[
"FORCE_WORK_GROUP_SIZE"
]
=
cu
.
intToString
(
nb
.
getForceThreadBlockSize
());
defines
[
"TILE_SIZE"
]
=
cu
.
intToString
(
CudaContext
::
TileSize
);
int
numExclusionTiles
=
nb
.
getExclusionTiles
().
getSize
();
defines
[
"NUM_TILES_WITH_EXCLUSIONS"
]
=
cu
.
intToString
(
numExclusionTiles
);
int
numContexts
=
cu
.
getPlatformData
().
contexts
.
size
();
int
startExclusionIndex
=
cu
.
getContextIndex
()
*
numExclusionTiles
/
numContexts
;
int
endExclusionIndex
=
(
cu
.
getContextIndex
()
+
1
)
*
numExclusionTiles
/
numContexts
;
defines
[
"FIRST_EXCLUSION_TILE"
]
=
cu
.
intToString
(
startExclusionIndex
);
defines
[
"LAST_EXCLUSION_TILE"
]
=
cu
.
intToString
(
endExclusionIndex
);
map
<
string
,
string
>
replacements
;
map
<
string
,
string
>
replacements
;
CUmodule
module
=
cu
.
createModule
(
CudaKernelSources
::
vectorOps
+
cu
.
replaceStrings
(
CudaKernelSources
::
gbsaObc1
,
replacements
),
defines
);
CUmodule
module
=
cu
.
createModule
(
CudaKernelSources
::
vectorOps
+
cu
.
replaceStrings
(
CudaKernelSources
::
gbsaObc1
,
replacements
),
defines
);
computeBornSumKernel
=
cu
.
getKernel
(
module
,
"computeBornSum"
);
computeBornSumKernel
=
cu
.
getKernel
(
module
,
"computeBornSum"
);
...
@@ -2083,12 +2080,12 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
...
@@ -2083,12 +2080,12 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
computeSumArgs
.
push_back
(
cu
.
getPeriodicBoxSizePointer
());
computeSumArgs
.
push_back
(
cu
.
getPeriodicBoxSizePointer
());
computeSumArgs
.
push_back
(
cu
.
getInvPeriodicBoxSizePointer
());
computeSumArgs
.
push_back
(
cu
.
getInvPeriodicBoxSizePointer
());
computeSumArgs
.
push_back
(
&
maxTiles
);
computeSumArgs
.
push_back
(
&
maxTiles
);
computeSumArgs
.
push_back
(
&
nb
.
getInteractionFlags
().
getDevicePointer
());
computeSumArgs
.
push_back
(
&
nb
.
getBlockCenters
().
getDevicePointer
());
computeSumArgs
.
push_back
(
&
nb
.
getInteractingAtoms
().
getDevicePointer
());
}
}
else
else
computeSumArgs
.
push_back
(
&
maxTiles
);
computeSumArgs
.
push_back
(
&
maxTiles
);
computeSumArgs
.
push_back
(
&
nb
.
getExclusionIndices
().
getDevicePointer
());
computeSumArgs
.
push_back
(
&
nb
.
getExclusionTiles
().
getDevicePointer
());
computeSumArgs
.
push_back
(
&
nb
.
getExclusionRowIndices
().
getDevicePointer
());
force1Kernel
=
cu
.
getKernel
(
module
,
"computeGBSAForce1"
);
force1Kernel
=
cu
.
getKernel
(
module
,
"computeGBSAForce1"
);
force1Args
.
push_back
(
&
cu
.
getForce
().
getDevicePointer
());
force1Args
.
push_back
(
&
cu
.
getForce
().
getDevicePointer
());
force1Args
.
push_back
(
&
bornForce
->
getDevicePointer
());
force1Args
.
push_back
(
&
bornForce
->
getDevicePointer
());
...
@@ -2101,12 +2098,12 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
...
@@ -2101,12 +2098,12 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
force1Args
.
push_back
(
cu
.
getPeriodicBoxSizePointer
());
force1Args
.
push_back
(
cu
.
getPeriodicBoxSizePointer
());
force1Args
.
push_back
(
cu
.
getInvPeriodicBoxSizePointer
());
force1Args
.
push_back
(
cu
.
getInvPeriodicBoxSizePointer
());
force1Args
.
push_back
(
&
maxTiles
);
force1Args
.
push_back
(
&
maxTiles
);
force1Args
.
push_back
(
&
nb
.
getInteractionFlags
().
getDevicePointer
());
force1Args
.
push_back
(
&
nb
.
getBlockCenters
().
getDevicePointer
());
force1Args
.
push_back
(
&
nb
.
getInteractingAtoms
().
getDevicePointer
());
}
}
else
else
force1Args
.
push_back
(
&
maxTiles
);
force1Args
.
push_back
(
&
maxTiles
);
force1Args
.
push_back
(
&
nb
.
getExclusionIndices
().
getDevicePointer
());
force1Args
.
push_back
(
&
nb
.
getExclusionTiles
().
getDevicePointer
());
force1Args
.
push_back
(
&
nb
.
getExclusionRowIndices
().
getDevicePointer
());
reduceBornSumKernel
=
cu
.
getKernel
(
module
,
"reduceBornSum"
);
reduceBornSumKernel
=
cu
.
getKernel
(
module
,
"reduceBornSum"
);
reduceBornForceKernel
=
cu
.
getKernel
(
module
,
"reduceBornForce"
);
reduceBornForceKernel
=
cu
.
getKernel
(
module
,
"reduceBornForce"
);
}
}
...
@@ -2115,8 +2112,8 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
...
@@ -2115,8 +2112,8 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
maxTiles
=
nb
.
getInteractingTiles
().
getSize
();
maxTiles
=
nb
.
getInteractingTiles
().
getSize
();
computeSumArgs
[
3
]
=
&
nb
.
getInteractingTiles
().
getDevicePointer
();
computeSumArgs
[
3
]
=
&
nb
.
getInteractingTiles
().
getDevicePointer
();
force1Args
[
5
]
=
&
nb
.
getInteractingTiles
().
getDevicePointer
();
force1Args
[
5
]
=
&
nb
.
getInteractingTiles
().
getDevicePointer
();
computeSumArgs
[
8
]
=
&
nb
.
getInteracti
onFlag
s
().
getDevicePointer
();
computeSumArgs
[
9
]
=
&
nb
.
getInteracti
ngAtom
s
().
getDevicePointer
();
force1Args
[
1
0
]
=
&
nb
.
getInteracti
onFlag
s
().
getDevicePointer
();
force1Args
[
1
1
]
=
&
nb
.
getInteracti
ngAtom
s
().
getDevicePointer
();
}
}
}
}
cu
.
executeKernel
(
computeBornSumKernel
,
&
computeSumArgs
[
0
],
nb
.
getNumForceThreadBlocks
()
*
nb
.
getForceThreadBlockSize
(),
nb
.
getForceThreadBlockSize
());
cu
.
executeKernel
(
computeBornSumKernel
,
&
computeSumArgs
[
0
],
nb
.
getNumForceThreadBlocks
()
*
nb
.
getForceThreadBlockSize
(),
nb
.
getForceThreadBlockSize
());
...
@@ -2244,16 +2241,17 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
...
@@ -2244,16 +2241,17 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
// Record parameters and exclusions.
// Record parameters and exclusions.
int
numParticles
=
force
.
getNumParticles
();
int
numParticles
=
force
.
getNumParticles
();
params
=
new
CudaParameterSet
(
cu
,
force
.
getNumPerParticleParameters
(),
numParticles
,
"customGBParameters"
,
true
);
int
paddedNumParticles
=
cu
.
getPaddedNumAtoms
();
computedValues
=
new
CudaParameterSet
(
cu
,
force
.
getNumComputedValues
(),
numParticles
,
"customGBComputedValues"
,
true
,
cu
.
getUseDoublePrecision
());
int
numParams
=
force
.
getNumPerParticleParameters
();
params
=
new
CudaParameterSet
(
cu
,
force
.
getNumPerParticleParameters
(),
paddedNumParticles
,
"customGBParameters"
,
true
);
computedValues
=
new
CudaParameterSet
(
cu
,
force
.
getNumComputedValues
(),
paddedNumParticles
,
"customGBComputedValues"
,
true
,
cu
.
getUseDoublePrecision
());
if
(
force
.
getNumGlobalParameters
()
>
0
)
if
(
force
.
getNumGlobalParameters
()
>
0
)
globals
=
CudaArray
::
create
<
float
>
(
cu
,
force
.
getNumGlobalParameters
(),
"customGBGlobals"
);
globals
=
CudaArray
::
create
<
float
>
(
cu
,
force
.
getNumGlobalParameters
(),
"customGBGlobals"
);
vector
<
vector
<
float
>
>
paramVector
(
n
umParticles
);
vector
<
vector
<
float
>
>
paramVector
(
paddedN
umParticles
,
vector
<
float
>
(
numParams
,
0
)
);
vector
<
vector
<
int
>
>
exclusionList
(
numParticles
);
vector
<
vector
<
int
>
>
exclusionList
(
numParticles
);
for
(
int
i
=
0
;
i
<
numParticles
;
i
++
)
{
for
(
int
i
=
0
;
i
<
numParticles
;
i
++
)
{
vector
<
double
>
parameters
;
vector
<
double
>
parameters
;
force
.
getParticleParameters
(
i
,
parameters
);
force
.
getParticleParameters
(
i
,
parameters
);
paramVector
[
i
].
resize
(
parameters
.
size
());
for
(
int
j
=
0
;
j
<
(
int
)
parameters
.
size
();
j
++
)
for
(
int
j
=
0
;
j
<
(
int
)
parameters
.
size
();
j
++
)
paramVector
[
i
][
j
]
=
(
float
)
parameters
[
j
];
paramVector
[
i
][
j
]
=
(
float
)
parameters
[
j
];
exclusionList
[
i
].
push_back
(
i
);
exclusionList
[
i
].
push_back
(
i
);
...
@@ -2406,23 +2404,22 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
...
@@ -2406,23 +2404,22 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
replacements
[
"LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"
]
=
loadLocal2
.
str
();
replacements
[
"LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"
]
=
loadLocal2
.
str
();
replacements
[
"LOAD_ATOM1_PARAMETERS"
]
=
load1
.
str
();
replacements
[
"LOAD_ATOM1_PARAMETERS"
]
=
load1
.
str
();
replacements
[
"LOAD_ATOM2_PARAMETERS"
]
=
load2
.
str
();
replacements
[
"LOAD_ATOM2_PARAMETERS"
]
=
load2
.
str
();
map
<
string
,
string
>
defines
;
if
(
useCutoff
)
if
(
useCutoff
)
d
efines
[
"USE_CUTOFF"
]
=
"1"
;
pairValueD
efines
[
"USE_CUTOFF"
]
=
"1"
;
if
(
usePeriodic
)
if
(
usePeriodic
)
d
efines
[
"USE_PERIODIC"
]
=
"1"
;
pairValueD
efines
[
"USE_PERIODIC"
]
=
"1"
;
if
(
useExclusionsForValue
)
if
(
useExclusionsForValue
)
d
efines
[
"USE_EXCLUSIONS"
]
=
"1"
;
pairValueD
efines
[
"USE_EXCLUSIONS"
]
=
"1"
;
if
(
atomParamSize
%
2
==
0
&&
!
cu
.
getUseDoublePrecision
())
if
(
atomParamSize
%
2
==
0
&&
!
cu
.
getUseDoublePrecision
())
d
efines
[
"NEED_PADDING"
]
=
"1"
;
pairValueD
efines
[
"NEED_PADDING"
]
=
"1"
;
d
efines
[
"WARPS_PER_GROUP"
]
=
cu
.
intToString
(
cu
.
getNonbondedUtilities
().
getForceThreadBlockSize
()
/
CudaContext
::
TileSize
);
pairValueD
efines
[
"WARPS_PER_GROUP"
]
=
cu
.
intToString
(
cu
.
getNonbondedUtilities
().
getForceThreadBlockSize
()
/
CudaContext
::
TileSize
);
d
efines
[
"THREAD_BLOCK_SIZE"
]
=
cu
.
intToString
(
cu
.
getNonbondedUtilities
().
getForceThreadBlockSize
());
pairValueD
efines
[
"THREAD_BLOCK_SIZE"
]
=
cu
.
intToString
(
cu
.
getNonbondedUtilities
().
getForceThreadBlockSize
());
d
efines
[
"CUTOFF_SQUARED"
]
=
cu
.
doubleToString
(
force
.
getCutoffDistance
()
*
force
.
getCutoffDistance
());
pairValueD
efines
[
"CUTOFF_SQUARED"
]
=
cu
.
doubleToString
(
force
.
getCutoffDistance
()
*
force
.
getCutoffDistance
());
d
efines
[
"NUM_ATOMS"
]
=
cu
.
intToString
(
cu
.
getNumAtoms
());
pairValueD
efines
[
"NUM_ATOMS"
]
=
cu
.
intToString
(
cu
.
getNumAtoms
());
d
efines
[
"PADDED_NUM_ATOMS"
]
=
cu
.
intToString
(
cu
.
getPaddedNumAtoms
());
pairValueD
efines
[
"PADDED_NUM_ATOMS"
]
=
cu
.
intToString
(
cu
.
getPaddedNumAtoms
());
d
efines
[
"NUM_BLOCKS"
]
=
cu
.
intToString
(
cu
.
getNumAtomBlocks
());
pairValueD
efines
[
"NUM_BLOCKS"
]
=
cu
.
intToString
(
cu
.
getNumAtomBlocks
());
CUmodule
module
=
cu
.
createModule
(
CudaKernelSources
::
vectorOps
+
cu
.
replace
String
s
(
Cuda
KernelSources
::
customGBValueN2
,
replacements
),
defines
);
pairValueDefines
[
"TILE_SIZE"
]
=
cu
.
intTo
String
(
Cuda
Context
::
TileSize
);
pairValue
Kernel
=
cu
.
getKernel
(
module
,
"computeN2Value"
);
pairValue
Src
=
cu
.
replaceStrings
(
CudaKernelSources
::
customGBValueN2
,
replacements
);
if
(
useExclusionsForValue
)
if
(
useExclusionsForValue
)
cu
.
getNonbondedUtilities
().
requestExclusions
(
exclusionList
);
cu
.
getNonbondedUtilities
().
requestExclusions
(
exclusionList
);
}
}
...
@@ -2574,23 +2571,22 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
...
@@ -2574,23 +2571,22 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
replacements
[
"RECORD_DERIVATIVE_2"
]
=
recordDeriv
.
str
();
replacements
[
"RECORD_DERIVATIVE_2"
]
=
recordDeriv
.
str
();
replacements
[
"STORE_DERIVATIVES_1"
]
=
storeDerivs1
.
str
();
replacements
[
"STORE_DERIVATIVES_1"
]
=
storeDerivs1
.
str
();
replacements
[
"STORE_DERIVATIVES_2"
]
=
storeDerivs2
.
str
();
replacements
[
"STORE_DERIVATIVES_2"
]
=
storeDerivs2
.
str
();
map
<
string
,
string
>
defines
;
if
(
useCutoff
)
if
(
useCutoff
)
d
efines
[
"USE_CUTOFF"
]
=
"1"
;
pairEnergyD
efines
[
"USE_CUTOFF"
]
=
"1"
;
if
(
usePeriodic
)
if
(
usePeriodic
)
d
efines
[
"USE_PERIODIC"
]
=
"1"
;
pairEnergyD
efines
[
"USE_PERIODIC"
]
=
"1"
;
if
(
anyExclusions
)
if
(
anyExclusions
)
d
efines
[
"USE_EXCLUSIONS"
]
=
"1"
;
pairEnergyD
efines
[
"USE_EXCLUSIONS"
]
=
"1"
;
if
(
atomParamSize
%
2
==
0
&&
!
cu
.
getUseDoublePrecision
())
if
(
atomParamSize
%
2
==
0
&&
!
cu
.
getUseDoublePrecision
())
d
efines
[
"NEED_PADDING"
]
=
"1"
;
pairEnergyD
efines
[
"NEED_PADDING"
]
=
"1"
;
d
efines
[
"THREAD_BLOCK_SIZE"
]
=
cu
.
intToString
(
cu
.
getNonbondedUtilities
().
getForceThreadBlockSize
());
pairEnergyD
efines
[
"THREAD_BLOCK_SIZE"
]
=
cu
.
intToString
(
cu
.
getNonbondedUtilities
().
getForceThreadBlockSize
());
d
efines
[
"WARPS_PER_GROUP"
]
=
cu
.
intToString
(
cu
.
getNonbondedUtilities
().
getForceThreadBlockSize
()
/
CudaContext
::
TileSize
);
pairEnergyD
efines
[
"WARPS_PER_GROUP"
]
=
cu
.
intToString
(
cu
.
getNonbondedUtilities
().
getForceThreadBlockSize
()
/
CudaContext
::
TileSize
);
d
efines
[
"CUTOFF_SQUARED"
]
=
cu
.
doubleToString
(
force
.
getCutoffDistance
()
*
force
.
getCutoffDistance
());
pairEnergyD
efines
[
"CUTOFF_SQUARED"
]
=
cu
.
doubleToString
(
force
.
getCutoffDistance
()
*
force
.
getCutoffDistance
());
d
efines
[
"NUM_ATOMS"
]
=
cu
.
intToString
(
cu
.
getNumAtoms
());
pairEnergyD
efines
[
"NUM_ATOMS"
]
=
cu
.
intToString
(
cu
.
getNumAtoms
());
d
efines
[
"PADDED_NUM_ATOMS"
]
=
cu
.
intToString
(
cu
.
getPaddedNumAtoms
());
pairEnergyD
efines
[
"PADDED_NUM_ATOMS"
]
=
cu
.
intToString
(
cu
.
getPaddedNumAtoms
());
d
efines
[
"NUM_BLOCKS"
]
=
cu
.
intToString
(
cu
.
getNumAtomBlocks
());
pairEnergyD
efines
[
"NUM_BLOCKS"
]
=
cu
.
intToString
(
cu
.
getNumAtomBlocks
());
CUmodule
module
=
cu
.
createModule
(
CudaKernelSources
::
vectorOps
+
cu
.
replace
String
s
(
Cuda
KernelSources
::
customGBEnergyN2
,
replacements
),
defines
);
pairEnergyDefines
[
"TILE_SIZE"
]
=
cu
.
intTo
String
(
Cuda
Context
::
TileSize
);
pairEnergy
Kernel
=
cu
.
getKernel
(
module
,
"computeN2Energy"
);
pairEnergy
Src
=
cu
.
replaceStrings
(
CudaKernelSources
::
customGBEnergyN2
,
replacements
);
}
}
{
{
// Create the kernel to reduce the derivatives and calculate per-particle energy terms.
// Create the kernel to reduce the derivatives and calculate per-particle energy terms.
...
@@ -2834,14 +2830,46 @@ double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFo
...
@@ -2834,14 +2830,46 @@ double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFo
CudaNonbondedUtilities
&
nb
=
cu
.
getNonbondedUtilities
();
CudaNonbondedUtilities
&
nb
=
cu
.
getNonbondedUtilities
();
if
(
!
hasInitializedKernels
)
{
if
(
!
hasInitializedKernels
)
{
hasInitializedKernels
=
true
;
hasInitializedKernels
=
true
;
// These two kernels can't be compiled in initialize(), because the nonbonded utilities object
// has not yet been initialized then.
{
int
numExclusionTiles
=
cu
.
getNonbondedUtilities
().
getExclusionTiles
().
getSize
();
pairValueDefines
[
"NUM_TILES_WITH_EXCLUSIONS"
]
=
cu
.
intToString
(
numExclusionTiles
);
int
numContexts
=
cu
.
getPlatformData
().
contexts
.
size
();
int
startExclusionIndex
=
cu
.
getContextIndex
()
*
numExclusionTiles
/
numContexts
;
int
endExclusionIndex
=
(
cu
.
getContextIndex
()
+
1
)
*
numExclusionTiles
/
numContexts
;
pairValueDefines
[
"FIRST_EXCLUSION_TILE"
]
=
cu
.
intToString
(
startExclusionIndex
);
pairValueDefines
[
"LAST_EXCLUSION_TILE"
]
=
cu
.
intToString
(
endExclusionIndex
);
CUmodule
module
=
cu
.
createModule
(
CudaKernelSources
::
vectorOps
+
pairValueSrc
,
pairValueDefines
);
pairValueKernel
=
cu
.
getKernel
(
module
,
"computeN2Value"
);
pairValueSrc
=
""
;
pairValueDefines
.
clear
();
}
{
int
numExclusionTiles
=
cu
.
getNonbondedUtilities
().
getExclusionTiles
().
getSize
();
pairEnergyDefines
[
"NUM_TILES_WITH_EXCLUSIONS"
]
=
cu
.
intToString
(
numExclusionTiles
);
int
numContexts
=
cu
.
getPlatformData
().
contexts
.
size
();
int
startExclusionIndex
=
cu
.
getContextIndex
()
*
numExclusionTiles
/
numContexts
;
int
endExclusionIndex
=
(
cu
.
getContextIndex
()
+
1
)
*
numExclusionTiles
/
numContexts
;
pairEnergyDefines
[
"FIRST_EXCLUSION_TILE"
]
=
cu
.
intToString
(
startExclusionIndex
);
pairEnergyDefines
[
"LAST_EXCLUSION_TILE"
]
=
cu
.
intToString
(
endExclusionIndex
);
CUmodule
module
=
cu
.
createModule
(
CudaKernelSources
::
vectorOps
+
pairEnergySrc
,
pairEnergyDefines
);
pairEnergyKernel
=
cu
.
getKernel
(
module
,
"computeN2Energy"
);
pairEnergySrc
=
""
;
pairEnergyDefines
.
clear
();
}
// Set arguments for kernels.
maxTiles
=
(
nb
.
getUseCutoff
()
?
nb
.
getInteractingTiles
().
getSize
()
:
cu
.
getNumAtomBlocks
()
*
(
cu
.
getNumAtomBlocks
()
+
1
)
/
2
);
maxTiles
=
(
nb
.
getUseCutoff
()
?
nb
.
getInteractingTiles
().
getSize
()
:
cu
.
getNumAtomBlocks
()
*
(
cu
.
getNumAtomBlocks
()
+
1
)
/
2
);
valueBuffers
=
CudaArray
::
create
<
long
long
>
(
cu
,
cu
.
getPaddedNumAtoms
(),
"customGBValueBuffers"
);
valueBuffers
=
CudaArray
::
create
<
long
long
>
(
cu
,
cu
.
getPaddedNumAtoms
(),
"customGBValueBuffers"
);
cu
.
addAutoclearBuffer
(
*
valueBuffers
);
cu
.
addAutoclearBuffer
(
*
valueBuffers
);
cu
.
clearBuffer
(
valueBuffers
->
getDevicePointer
(),
sizeof
(
long
long
)
*
valueBuffers
->
getSize
());
cu
.
clearBuffer
(
valueBuffers
->
getDevicePointer
(),
sizeof
(
long
long
)
*
valueBuffers
->
getSize
());
pairValueArgs
.
push_back
(
&
cu
.
getPosq
().
getDevicePointer
());
pairValueArgs
.
push_back
(
&
cu
.
getPosq
().
getDevicePointer
());
pairValueArgs
.
push_back
(
&
cu
.
getNonbondedUtilities
().
getExclusions
().
getDevicePointer
());
pairValueArgs
.
push_back
(
&
cu
.
getNonbondedUtilities
().
getExclusions
().
getDevicePointer
());
pairValueArgs
.
push_back
(
&
cu
.
getNonbondedUtilities
().
getExclusionIndices
().
getDevicePointer
());
pairValueArgs
.
push_back
(
&
cu
.
getNonbondedUtilities
().
getExclusionTiles
().
getDevicePointer
());
pairValueArgs
.
push_back
(
&
cu
.
getNonbondedUtilities
().
getExclusionRowIndices
().
getDevicePointer
());
pairValueArgs
.
push_back
(
&
valueBuffers
->
getDevicePointer
());
pairValueArgs
.
push_back
(
&
valueBuffers
->
getDevicePointer
());
if
(
nb
.
getUseCutoff
())
{
if
(
nb
.
getUseCutoff
())
{
pairValueArgs
.
push_back
(
&
nb
.
getInteractingTiles
().
getDevicePointer
());
pairValueArgs
.
push_back
(
&
nb
.
getInteractingTiles
().
getDevicePointer
());
...
@@ -2849,7 +2877,8 @@ double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFo
...
@@ -2849,7 +2877,8 @@ double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFo
pairValueArgs
.
push_back
(
cu
.
getPeriodicBoxSizePointer
());
pairValueArgs
.
push_back
(
cu
.
getPeriodicBoxSizePointer
());
pairValueArgs
.
push_back
(
cu
.
getInvPeriodicBoxSizePointer
());
pairValueArgs
.
push_back
(
cu
.
getInvPeriodicBoxSizePointer
());
pairValueArgs
.
push_back
(
&
maxTiles
);
pairValueArgs
.
push_back
(
&
maxTiles
);
pairValueArgs
.
push_back
(
&
nb
.
getInteractionFlags
().
getDevicePointer
());
pairValueArgs
.
push_back
(
&
nb
.
getBlockCenters
().
getDevicePointer
());
pairValueArgs
.
push_back
(
&
nb
.
getInteractingAtoms
().
getDevicePointer
());
}
}
else
else
pairValueArgs
.
push_back
(
&
maxTiles
);
pairValueArgs
.
push_back
(
&
maxTiles
);
...
@@ -2881,15 +2910,15 @@ double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFo
...
@@ -2881,15 +2910,15 @@ double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFo
pairEnergyArgs
.
push_back
(
&
cu
.
getEnergyBuffer
().
getDevicePointer
());
pairEnergyArgs
.
push_back
(
&
cu
.
getEnergyBuffer
().
getDevicePointer
());
pairEnergyArgs
.
push_back
(
&
cu
.
getPosq
().
getDevicePointer
());
pairEnergyArgs
.
push_back
(
&
cu
.
getPosq
().
getDevicePointer
());
pairEnergyArgs
.
push_back
(
&
cu
.
getNonbondedUtilities
().
getExclusions
().
getDevicePointer
());
pairEnergyArgs
.
push_back
(
&
cu
.
getNonbondedUtilities
().
getExclusions
().
getDevicePointer
());
pairEnergyArgs
.
push_back
(
&
cu
.
getNonbondedUtilities
().
getExclusionIndices
().
getDevicePointer
());
pairEnergyArgs
.
push_back
(
&
cu
.
getNonbondedUtilities
().
getExclusionTiles
().
getDevicePointer
());
pairEnergyArgs
.
push_back
(
&
cu
.
getNonbondedUtilities
().
getExclusionRowIndices
().
getDevicePointer
());
if
(
nb
.
getUseCutoff
())
{
if
(
nb
.
getUseCutoff
())
{
pairEnergyArgs
.
push_back
(
&
nb
.
getInteractingTiles
().
getDevicePointer
());
pairEnergyArgs
.
push_back
(
&
nb
.
getInteractingTiles
().
getDevicePointer
());
pairEnergyArgs
.
push_back
(
&
nb
.
getInteractionCount
().
getDevicePointer
());
pairEnergyArgs
.
push_back
(
&
nb
.
getInteractionCount
().
getDevicePointer
());
pairEnergyArgs
.
push_back
(
cu
.
getPeriodicBoxSizePointer
());
pairEnergyArgs
.
push_back
(
cu
.
getPeriodicBoxSizePointer
());
pairEnergyArgs
.
push_back
(
cu
.
getInvPeriodicBoxSizePointer
());
pairEnergyArgs
.
push_back
(
cu
.
getInvPeriodicBoxSizePointer
());
pairEnergyArgs
.
push_back
(
&
maxTiles
);
pairEnergyArgs
.
push_back
(
&
maxTiles
);
pairEnergyArgs
.
push_back
(
&
nb
.
getInteractionFlags
().
getDevicePointer
());
pairEnergyArgs
.
push_back
(
&
nb
.
getBlockCenters
().
getDevicePointer
());
pairEnergyArgs
.
push_back
(
&
nb
.
getInteractingAtoms
().
getDevicePointer
());
}
}
else
else
pairEnergyArgs
.
push_back
(
&
maxTiles
);
pairEnergyArgs
.
push_back
(
&
maxTiles
);
...
@@ -2953,10 +2982,10 @@ double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFo
...
@@ -2953,10 +2982,10 @@ double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFo
if
(
nb
.
getUseCutoff
())
{
if
(
nb
.
getUseCutoff
())
{
if
(
maxTiles
<
nb
.
getInteractingTiles
().
getSize
())
{
if
(
maxTiles
<
nb
.
getInteractingTiles
().
getSize
())
{
maxTiles
=
nb
.
getInteractingTiles
().
getSize
();
maxTiles
=
nb
.
getInteractingTiles
().
getSize
();
pairValueArgs
[
5
]
=
&
nb
.
getInteractingTiles
().
getDevicePointer
();
pairValueArgs
[
4
]
=
&
nb
.
getInteractingTiles
().
getDevicePointer
();
pairEnergyArgs
[
6
]
=
&
nb
.
getInteractingTiles
().
getDevicePointer
();
pairEnergyArgs
[
5
]
=
&
nb
.
getInteractingTiles
().
getDevicePointer
();
pairValueArgs
[
10
]
=
&
nb
.
getInteracti
onFlag
s
().
getDevicePointer
();
pairValueArgs
[
10
]
=
&
nb
.
getInteracti
ngAtom
s
().
getDevicePointer
();
pairEnergyArgs
[
11
]
=
&
nb
.
getInteracti
onFlag
s
().
getDevicePointer
();
pairEnergyArgs
[
11
]
=
&
nb
.
getInteracti
ngAtom
s
().
getDevicePointer
();
}
}
}
}
cu
.
executeKernel
(
pairValueKernel
,
&
pairValueArgs
[
0
],
nb
.
getNumForceThreadBlocks
()
*
nb
.
getForceThreadBlockSize
(),
nb
.
getForceThreadBlockSize
());
cu
.
executeKernel
(
pairValueKernel
,
&
pairValueArgs
[
0
],
nb
.
getNumForceThreadBlocks
()
*
nb
.
getForceThreadBlockSize
(),
nb
.
getForceThreadBlockSize
());
...
@@ -2976,11 +3005,10 @@ void CudaCalcCustomGBForceKernel::copyParametersToContext(ContextImpl& context,
...
@@ -2976,11 +3005,10 @@ void CudaCalcCustomGBForceKernel::copyParametersToContext(ContextImpl& context,
// Record the per-particle parameters.
// Record the per-particle parameters.
vector
<
vector
<
float
>
>
paramVector
(
numParticles
);
vector
<
vector
<
float
>
>
paramVector
(
cu
.
getPaddedNumAtoms
(),
vector
<
float
>
(
force
.
getNumPerParticleParameters
(),
0
)
);
vector
<
double
>
parameters
;
vector
<
double
>
parameters
;
for
(
int
i
=
0
;
i
<
numParticles
;
i
++
)
{
for
(
int
i
=
0
;
i
<
numParticles
;
i
++
)
{
force
.
getParticleParameters
(
i
,
parameters
);
force
.
getParticleParameters
(
i
,
parameters
);
paramVector
[
i
].
resize
(
parameters
.
size
());
for
(
int
j
=
0
;
j
<
(
int
)
parameters
.
size
();
j
++
)
for
(
int
j
=
0
;
j
<
(
int
)
parameters
.
size
();
j
++
)
paramVector
[
i
][
j
]
=
(
float
)
parameters
[
j
];
paramVector
[
i
][
j
]
=
(
float
)
parameters
[
j
];
}
}
...
...
platforms/cuda/src/CudaKernels.h
View file @
93c467b2
...
@@ -557,8 +557,7 @@ class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
...
@@ -557,8 +557,7 @@ class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
public:
public:
CudaCalcNonbondedForceKernel
(
std
::
string
name
,
const
Platform
&
platform
,
CudaContext
&
cu
,
System
&
system
)
:
CalcNonbondedForceKernel
(
name
,
platform
),
CudaCalcNonbondedForceKernel
(
std
::
string
name
,
const
Platform
&
platform
,
CudaContext
&
cu
,
System
&
system
)
:
CalcNonbondedForceKernel
(
name
,
platform
),
cu
(
cu
),
hasInitializedFFT
(
false
),
sigmaEpsilon
(
NULL
),
exceptionParams
(
NULL
),
cosSinSums
(
NULL
),
directPmeGrid
(
NULL
),
reciprocalPmeGrid
(
NULL
),
cu
(
cu
),
hasInitializedFFT
(
false
),
sigmaEpsilon
(
NULL
),
exceptionParams
(
NULL
),
cosSinSums
(
NULL
),
directPmeGrid
(
NULL
),
reciprocalPmeGrid
(
NULL
),
pmeBsplineModuliX
(
NULL
),
pmeBsplineModuliY
(
NULL
),
pmeBsplineModuliZ
(
NULL
),
pmeBsplineTheta
(
NULL
),
pmeBsplineDTheta
(
NULL
),
pmeBsplineModuliX
(
NULL
),
pmeBsplineModuliY
(
NULL
),
pmeBsplineModuliZ
(
NULL
),
pmeAtomRange
(
NULL
),
pmeAtomGridIndex
(
NULL
),
sort
(
NULL
)
{
pmeAtomRange
(
NULL
),
pmeAtomGridIndex
(
NULL
),
sort
(
NULL
)
{
}
}
~
CudaCalcNonbondedForceKernel
();
~
CudaCalcNonbondedForceKernel
();
/**
/**
...
@@ -607,8 +606,6 @@ private:
...
@@ -607,8 +606,6 @@ private:
CudaArray
*
pmeBsplineModuliX
;
CudaArray
*
pmeBsplineModuliX
;
CudaArray
*
pmeBsplineModuliY
;
CudaArray
*
pmeBsplineModuliY
;
CudaArray
*
pmeBsplineModuliZ
;
CudaArray
*
pmeBsplineModuliZ
;
CudaArray
*
pmeBsplineTheta
;
CudaArray
*
pmeBsplineDTheta
;
CudaArray
*
pmeAtomRange
;
CudaArray
*
pmeAtomRange
;
CudaArray
*
pmeAtomGridIndex
;
CudaArray
*
pmeAtomGridIndex
;
CudaSort
*
sort
;
CudaSort
*
sort
;
...
@@ -617,9 +614,6 @@ private:
...
@@ -617,9 +614,6 @@ private:
CUfunction
ewaldSumsKernel
;
CUfunction
ewaldSumsKernel
;
CUfunction
ewaldForcesKernel
;
CUfunction
ewaldForcesKernel
;
CUfunction
pmeGridIndexKernel
;
CUfunction
pmeGridIndexKernel
;
CUfunction
pmeAtomRangeKernel
;
CUfunction
pmeZIndexKernel
;
CUfunction
pmeUpdateBsplinesKernel
;
CUfunction
pmeSpreadChargeKernel
;
CUfunction
pmeSpreadChargeKernel
;
CUfunction
pmeFinishSpreadChargeKernel
;
CUfunction
pmeFinishSpreadChargeKernel
;
CUfunction
pmeEvalEnergyKernel
;
CUfunction
pmeEvalEnergyKernel
;
...
@@ -776,6 +770,8 @@ private:
...
@@ -776,6 +770,8 @@ private:
System
&
system
;
System
&
system
;
CUfunction
pairValueKernel
,
perParticleValueKernel
,
pairEnergyKernel
,
perParticleEnergyKernel
,
gradientChainRuleKernel
;
CUfunction
pairValueKernel
,
perParticleValueKernel
,
pairEnergyKernel
,
perParticleEnergyKernel
,
gradientChainRuleKernel
;
std
::
vector
<
void
*>
pairValueArgs
,
perParticleValueArgs
,
pairEnergyArgs
,
perParticleEnergyArgs
,
gradientChainRuleArgs
;
std
::
vector
<
void
*>
pairValueArgs
,
perParticleValueArgs
,
pairEnergyArgs
,
perParticleEnergyArgs
,
gradientChainRuleArgs
;
std
::
string
pairValueSrc
,
pairEnergySrc
;
std
::
map
<
std
::
string
,
std
::
string
>
pairValueDefines
,
pairEnergyDefines
;
};
};
/**
/**
...
...
platforms/cuda/src/CudaNonbondedUtilities.cpp
View file @
93c467b2
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* *
* Portions copyright (c) 2009-201
2
Stanford University and the Authors. *
* Portions copyright (c) 2009-201
3
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Authors: Peter Eastman *
* Contributors: *
* Contributors: *
* *
* *
...
@@ -29,6 +29,8 @@
...
@@ -29,6 +29,8 @@
#include "CudaArray.h"
#include "CudaArray.h"
#include "CudaKernelSources.h"
#include "CudaKernelSources.h"
#include "CudaExpressionUtilities.h"
#include "CudaExpressionUtilities.h"
#include "CudaSort.h"
#include <algorithm>
#include <map>
#include <map>
#include <set>
#include <set>
#include <utility>
#include <utility>
...
@@ -43,15 +45,33 @@ using namespace std;
...
@@ -43,15 +45,33 @@ using namespace std;
throw OpenMMException(m.str());\
throw OpenMMException(m.str());\
}
}
CudaNonbondedUtilities
::
CudaNonbondedUtilities
(
CudaContext
&
context
)
:
context
(
context
),
cutoff
(
-
1.0
),
useCutoff
(
false
),
anyExclusions
(
false
),
exclusionIndices
(
NULL
),
exclusionRowIndices
(
NULL
),
exclusions
(
NULL
),
interactingTiles
(
NULL
),
interactionFlags
(
NULL
),
class
CudaNonbondedUtilities
::
BlockSortTrait
:
public
CudaSort
::
SortTrait
{
interactionCount
(
NULL
),
blockCenter
(
NULL
),
blockBoundingBox
(
NULL
),
nonbondedForceGroup
(
0
)
{
public:
BlockSortTrait
(
bool
useDouble
)
:
useDouble
(
useDouble
)
{
}
int
getDataSize
()
const
{
return
useDouble
?
sizeof
(
double2
)
:
sizeof
(
float2
);}
int
getKeySize
()
const
{
return
useDouble
?
sizeof
(
double
)
:
sizeof
(
float
);}
const
char
*
getDataType
()
const
{
return
"real2"
;}
const
char
*
getKeyType
()
const
{
return
"real"
;}
const
char
*
getMinKey
()
const
{
return
"-3.40282e+38f"
;}
const
char
*
getMaxKey
()
const
{
return
"3.40282e+38f"
;}
const
char
*
getMaxValue
()
const
{
return
"make_real2(3.40282e+38f, 3.40282e+38f)"
;}
const
char
*
getSortKey
()
const
{
return
"value.x"
;}
private:
bool
useDouble
;
};
CudaNonbondedUtilities
::
CudaNonbondedUtilities
(
CudaContext
&
context
)
:
context
(
context
),
cutoff
(
-
1.0
),
useCutoff
(
false
),
anyExclusions
(
false
),
usePadding
(
true
),
exclusionIndices
(
NULL
),
exclusionRowIndices
(
NULL
),
exclusionTiles
(
NULL
),
exclusions
(
NULL
),
interactingTiles
(
NULL
),
interactingAtoms
(
NULL
),
interactionCount
(
NULL
),
blockCenter
(
NULL
),
blockBoundingBox
(
NULL
),
sortedBlocks
(
NULL
),
sortedBlockCenter
(
NULL
),
sortedBlockBoundingBox
(
NULL
),
oldPositions
(
NULL
),
rebuildNeighborList
(
NULL
),
blockSorter
(
NULL
),
nonbondedForceGroup
(
0
)
{
// Decide how many thread blocks to use.
// Decide how many thread blocks to use.
string
errorMessage
=
"Error initializing nonbonded utilities"
;
string
errorMessage
=
"Error initializing nonbonded utilities"
;
int
multiprocessors
;
int
multiprocessors
;
CHECK_RESULT
(
cuDeviceGetAttribute
(
&
multiprocessors
,
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
,
context
.
getDevice
()));
CHECK_RESULT
(
cuDeviceGetAttribute
(
&
multiprocessors
,
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
,
context
.
getDevice
()));
numForceThreadBlocks
=
3
*
multiprocessors
;
numForceThreadBlocks
=
4
*
multiprocessors
;
forceThreadBlockSize
=
(
context
.
getComputeCapability
()
<
2.0
?
128
:
256
);
forceThreadBlockSize
=
(
context
.
getComputeCapability
()
<
2.0
?
128
:
256
);
}
}
...
@@ -60,18 +80,32 @@ CudaNonbondedUtilities::~CudaNonbondedUtilities() {
...
@@ -60,18 +80,32 @@ CudaNonbondedUtilities::~CudaNonbondedUtilities() {
delete
exclusionIndices
;
delete
exclusionIndices
;
if
(
exclusionRowIndices
!=
NULL
)
if
(
exclusionRowIndices
!=
NULL
)
delete
exclusionRowIndices
;
delete
exclusionRowIndices
;
if
(
exclusionTiles
!=
NULL
)
delete
exclusionTiles
;
if
(
exclusions
!=
NULL
)
if
(
exclusions
!=
NULL
)
delete
exclusions
;
delete
exclusions
;
if
(
interactingTiles
!=
NULL
)
if
(
interactingTiles
!=
NULL
)
delete
interactingTiles
;
delete
interactingTiles
;
if
(
interacti
onFlag
s
!=
NULL
)
if
(
interacti
ngAtom
s
!=
NULL
)
delete
interacti
onFlag
s
;
delete
interacti
ngAtom
s
;
if
(
interactionCount
!=
NULL
)
if
(
interactionCount
!=
NULL
)
delete
interactionCount
;
delete
interactionCount
;
if
(
blockCenter
!=
NULL
)
if
(
blockCenter
!=
NULL
)
delete
blockCenter
;
delete
blockCenter
;
if
(
blockBoundingBox
!=
NULL
)
if
(
blockBoundingBox
!=
NULL
)
delete
blockBoundingBox
;
delete
blockBoundingBox
;
if
(
sortedBlocks
!=
NULL
)
delete
sortedBlocks
;
if
(
sortedBlockCenter
!=
NULL
)
delete
sortedBlockCenter
;
if
(
sortedBlockBoundingBox
!=
NULL
)
delete
sortedBlockBoundingBox
;
if
(
oldPositions
!=
NULL
)
delete
oldPositions
;
if
(
rebuildNeighborList
!=
NULL
)
delete
rebuildNeighborList
;
if
(
blockSorter
!=
NULL
)
delete
blockSorter
;
}
}
void
CudaNonbondedUtilities
::
addInteraction
(
bool
usesCutoff
,
bool
usesPeriodic
,
bool
usesExclusions
,
double
cutoffDistance
,
const
vector
<
vector
<
int
>
>&
exclusionList
,
const
string
&
kernel
,
int
forceGroup
)
{
void
CudaNonbondedUtilities
::
addInteraction
(
bool
usesCutoff
,
bool
usesPeriodic
,
bool
usesExclusions
,
double
cutoffDistance
,
const
vector
<
vector
<
int
>
>&
exclusionList
,
const
string
&
kernel
,
int
forceGroup
)
{
...
@@ -124,6 +158,10 @@ void CudaNonbondedUtilities::requestExclusions(const vector<vector<int> >& exclu
...
@@ -124,6 +158,10 @@ void CudaNonbondedUtilities::requestExclusions(const vector<vector<int> >& exclu
}
}
}
}
static
bool
compareUshort2
(
ushort2
a
,
ushort2
b
)
{
return
((
a
.
y
<
b
.
y
)
||
(
a
.
y
==
b
.
y
&&
a
.
x
<
b
.
x
));
}
void
CudaNonbondedUtilities
::
initialize
(
const
System
&
system
)
{
void
CudaNonbondedUtilities
::
initialize
(
const
System
&
system
)
{
string
errorMessage
=
"Error initializing nonbonded utilities"
;
string
errorMessage
=
"Error initializing nonbonded utilities"
;
if
(
atomExclusions
.
size
()
==
0
)
{
if
(
atomExclusions
.
size
()
==
0
)
{
...
@@ -138,13 +176,10 @@ void CudaNonbondedUtilities::initialize(const System& system) {
...
@@ -138,13 +176,10 @@ void CudaNonbondedUtilities::initialize(const System& system) {
numAtoms
=
context
.
getNumAtoms
();
numAtoms
=
context
.
getNumAtoms
();
int
numAtomBlocks
=
context
.
getNumAtomBlocks
();
int
numAtomBlocks
=
context
.
getNumAtomBlocks
();
int
totalTiles
=
numAtomBlocks
*
(
numAtomBlocks
+
1
)
/
2
;
int
numContexts
=
context
.
getPlatformData
().
contexts
.
size
();
int
numContexts
=
context
.
getPlatformData
().
contexts
.
size
();
startTileIndex
=
context
.
getContextIndex
()
*
totalTiles
/
numContexts
;
setAtomBlockRange
(
context
.
getContextIndex
()
/
(
double
)
numContexts
,
(
context
.
getContextIndex
()
+
1
)
/
(
double
)
numContexts
);
int
endTileIndex
=
(
context
.
getContextIndex
()
+
1
)
*
totalTiles
/
numContexts
;
numTiles
=
endTileIndex
-
startTileIndex
;
// Build a list of
indices for the tiles with
exclusions.
// Build a list of
tiles that contain
exclusions.
set
<
pair
<
int
,
int
>
>
tilesWithExclusions
;
set
<
pair
<
int
,
int
>
>
tilesWithExclusions
;
for
(
int
atom1
=
0
;
atom1
<
(
int
)
atomExclusions
.
size
();
++
atom1
)
{
for
(
int
atom1
=
0
;
atom1
<
(
int
)
atomExclusions
.
size
();
++
atom1
)
{
...
@@ -155,19 +190,29 @@ void CudaNonbondedUtilities::initialize(const System& system) {
...
@@ -155,19 +190,29 @@ void CudaNonbondedUtilities::initialize(const System& system) {
tilesWithExclusions
.
insert
(
make_pair
(
max
(
x
,
y
),
min
(
x
,
y
)));
tilesWithExclusions
.
insert
(
make_pair
(
max
(
x
,
y
),
min
(
x
,
y
)));
}
}
}
}
if
(
context
.
getPaddedNumAtoms
()
>
context
.
getNumAtoms
())
{
vector
<
ushort2
>
exclusionTilesVec
;
for
(
int
i
=
0
;
i
<
numAtomBlocks
;
++
i
)
for
(
set
<
pair
<
int
,
int
>
>::
const_iterator
iter
=
tilesWithExclusions
.
begin
();
iter
!=
tilesWithExclusions
.
end
();
++
iter
)
tilesWithExclusions
.
insert
(
make_pair
(
numAtomBlocks
-
1
,
i
));
exclusionTilesVec
.
push_back
(
make_ushort2
((
unsigned
short
)
iter
->
first
,
(
unsigned
short
)
iter
->
second
));
sort
(
exclusionTilesVec
.
begin
(),
exclusionTilesVec
.
end
(),
compareUshort2
);
exclusionTiles
=
CudaArray
::
create
<
ushort2
>
(
context
,
exclusionTilesVec
.
size
(),
"exclusionTiles"
);
exclusionTiles
->
upload
(
exclusionTilesVec
);
map
<
pair
<
int
,
int
>
,
int
>
exclusionTileMap
;
for
(
int
i
=
0
;
i
<
(
int
)
exclusionTilesVec
.
size
();
i
++
)
{
ushort2
tile
=
exclusionTilesVec
[
i
];
exclusionTileMap
[
make_pair
(
tile
.
x
,
tile
.
y
)]
=
i
;
}
vector
<
vector
<
int
>
>
exclusionBlocksForBlock
(
numAtomBlocks
);
for
(
set
<
pair
<
int
,
int
>
>::
const_iterator
iter
=
tilesWithExclusions
.
begin
();
iter
!=
tilesWithExclusions
.
end
();
++
iter
)
{
exclusionBlocksForBlock
[
iter
->
first
].
push_back
(
iter
->
second
);
if
(
iter
->
first
!=
iter
->
second
)
exclusionBlocksForBlock
[
iter
->
second
].
push_back
(
iter
->
first
);
}
}
vector
<
unsigned
int
>
exclusionRowIndicesVec
(
numAtomBlocks
+
1
,
0
);
vector
<
unsigned
int
>
exclusionRowIndicesVec
(
numAtomBlocks
+
1
,
0
);
vector
<
unsigned
int
>
exclusionIndicesVec
;
vector
<
unsigned
int
>
exclusionIndicesVec
;
int
currentRow
=
0
;
for
(
int
i
=
0
;
i
<
numAtomBlocks
;
i
++
)
{
for
(
set
<
pair
<
int
,
int
>
>::
const_iterator
iter
=
tilesWithExclusions
.
begin
();
iter
!=
tilesWithExclusions
.
end
();
++
iter
)
{
exclusionIndicesVec
.
insert
(
exclusionIndicesVec
.
end
(),
exclusionBlocksForBlock
[
i
].
begin
(),
exclusionBlocksForBlock
[
i
].
end
());
while
(
iter
->
first
!=
currentRow
)
exclusionRowIndicesVec
[
i
+
1
]
=
exclusionIndicesVec
.
size
();
exclusionRowIndicesVec
[
++
currentRow
]
=
exclusionIndicesVec
.
size
();
exclusionIndicesVec
.
push_back
(
iter
->
second
);
}
}
exclusionRowIndicesVec
[
++
currentRow
]
=
exclusionIndicesVec
.
size
();
exclusionIndices
=
CudaArray
::
create
<
unsigned
int
>
(
context
,
exclusionIndicesVec
.
size
(),
"exclusionIndices"
);
exclusionIndices
=
CudaArray
::
create
<
unsigned
int
>
(
context
,
exclusionIndicesVec
.
size
(),
"exclusionIndices"
);
exclusionRowIndices
=
CudaArray
::
create
<
unsigned
int
>
(
context
,
exclusionRowIndicesVec
.
size
(),
"exclusionRowIndices"
);
exclusionRowIndices
=
CudaArray
::
create
<
unsigned
int
>
(
context
,
exclusionRowIndicesVec
.
size
(),
"exclusionRowIndices"
);
exclusionIndices
->
upload
(
exclusionIndicesVec
);
exclusionIndices
->
upload
(
exclusionIndicesVec
);
...
@@ -175,8 +220,9 @@ void CudaNonbondedUtilities::initialize(const System& system) {
...
@@ -175,8 +220,9 @@ void CudaNonbondedUtilities::initialize(const System& system) {
// Record the exclusion data.
// Record the exclusion data.
exclusions
=
CudaArray
::
create
<
unsigned
int
>
(
context
,
tilesWithExclusions
.
size
()
*
CudaContext
::
TileSize
,
"exclusions"
);
exclusions
=
CudaArray
::
create
<
tileflags
>
(
context
,
tilesWithExclusions
.
size
()
*
CudaContext
::
TileSize
,
"exclusions"
);
vector
<
unsigned
int
>
exclusionVec
(
exclusions
->
getSize
(),
0xFFFFFFFF
);
tileflags
allFlags
=
(
tileflags
)
-
1
;
vector
<
tileflags
>
exclusionVec
(
exclusions
->
getSize
(),
allFlags
);
for
(
int
atom1
=
0
;
atom1
<
(
int
)
atomExclusions
.
size
();
++
atom1
)
{
for
(
int
atom1
=
0
;
atom1
<
(
int
)
atomExclusions
.
size
();
++
atom1
)
{
int
x
=
atom1
/
CudaContext
::
TileSize
;
int
x
=
atom1
/
CudaContext
::
TileSize
;
int
offset1
=
atom1
-
x
*
CudaContext
::
TileSize
;
int
offset1
=
atom1
-
x
*
CudaContext
::
TileSize
;
...
@@ -185,31 +231,12 @@ void CudaNonbondedUtilities::initialize(const System& system) {
...
@@ -185,31 +231,12 @@ void CudaNonbondedUtilities::initialize(const System& system) {
int
y
=
atom2
/
CudaContext
::
TileSize
;
int
y
=
atom2
/
CudaContext
::
TileSize
;
int
offset2
=
atom2
-
y
*
CudaContext
::
TileSize
;
int
offset2
=
atom2
-
y
*
CudaContext
::
TileSize
;
if
(
x
>
y
)
{
if
(
x
>
y
)
{
int
index
=
findE
xclusion
Index
(
x
,
y
,
exclusionIndicesVec
,
exclusionRowIndicesVec
)
;
int
index
=
e
xclusion
TileMap
[
make_pair
(
x
,
y
)]
*
CudaContext
::
TileSize
;
exclusionVec
[
index
+
offset1
]
&=
0xFFFFFFFF
-
(
1
<<
offset2
);
exclusionVec
[
index
+
offset1
]
&=
allFlags
-
(
1
<<
offset2
);
}
}
else
{
else
{
int
index
=
findExclusionIndex
(
y
,
x
,
exclusionIndicesVec
,
exclusionRowIndicesVec
);
int
index
=
exclusionTileMap
[
make_pair
(
y
,
x
)]
*
CudaContext
::
TileSize
;
exclusionVec
[
index
+
offset2
]
&=
0xFFFFFFFF
-
(
1
<<
offset1
);
exclusionVec
[
index
+
offset2
]
&=
allFlags
-
(
1
<<
offset1
);
}
}
}
// Mark all interactions that involve a padding atom as being excluded.
for
(
int
atom1
=
context
.
getNumAtoms
();
atom1
<
context
.
getPaddedNumAtoms
();
++
atom1
)
{
int
x
=
atom1
/
CudaContext
::
TileSize
;
int
offset1
=
atom1
-
x
*
CudaContext
::
TileSize
;
for
(
int
atom2
=
0
;
atom2
<
context
.
getPaddedNumAtoms
();
++
atom2
)
{
int
y
=
atom2
/
CudaContext
::
TileSize
;
int
offset2
=
atom2
-
y
*
CudaContext
::
TileSize
;
if
(
x
>=
y
)
{
int
index
=
findExclusionIndex
(
x
,
y
,
exclusionIndicesVec
,
exclusionRowIndicesVec
);
exclusionVec
[
index
+
offset1
]
&=
0xFFFFFFFF
-
(
1
<<
offset2
);
}
if
(
y
>=
x
)
{
int
index
=
findExclusionIndex
(
y
,
x
,
exclusionIndicesVec
,
exclusionRowIndicesVec
);
exclusionVec
[
index
+
offset2
]
&=
0xFFFFFFFF
-
(
1
<<
offset1
);
}
}
}
}
}
}
...
@@ -219,26 +246,34 @@ void CudaNonbondedUtilities::initialize(const System& system) {
...
@@ -219,26 +246,34 @@ void CudaNonbondedUtilities::initialize(const System& system) {
// Create data structures for the neighbor list.
// Create data structures for the neighbor list.
if
(
useCutoff
)
{
if
(
useCutoff
)
{
// Select a size for the arrays that hold the neighbor list.
This esti
ma
t
e
is intentionally ver
y
// Select a size for the arrays that hold the neighbor list.
We have to
ma
k
e
a fairl
y
//
high, because if it ever is
too small
,
we
have to fall back to the N^2 algorithm
.
//
arbitrary guess, but if this turns out to be
too small we
'll increase it later
.
double4
boxSize
=
context
.
getPeriodicBoxSize
();
maxTiles
=
20
*
numAtomBlocks
;
maxTiles
=
(
int
)
(
numTiles
*
(
cutoff
/
boxSize
.
x
+
cutoff
/
boxSize
.
y
+
cutoff
/
boxSize
.
z
));
if
(
maxTiles
>
numTiles
)
if
(
maxTiles
>
numTiles
)
maxTiles
=
numTiles
;
maxTiles
=
numTiles
;
if
(
maxTiles
<
1
)
if
(
maxTiles
<
1
)
maxTiles
=
1
;
maxTiles
=
1
;
interactingTiles
=
CudaArray
::
create
<
ushort2
>
(
context
,
maxTiles
,
"interactingTiles"
);
interactingTiles
=
CudaArray
::
create
<
ushort2
>
(
context
,
maxTiles
,
"interactingTiles"
);
interacti
onFlag
s
=
CudaArray
::
create
<
unsigned
int
>
(
context
,
maxTiles
,
"interacti
onFlag
s"
);
interacti
ngAtom
s
=
CudaArray
::
create
<
int
>
(
context
,
CudaContext
::
TileSize
*
maxTiles
,
"interacti
ngAtom
s"
);
interactionCount
=
CudaArray
::
create
<
unsigned
int
>
(
context
,
1
,
"interactionCount"
);
interactionCount
=
CudaArray
::
create
<
unsigned
int
>
(
context
,
1
,
"interactionCount"
);
int
elementSize
=
(
context
.
getUseDoublePrecision
()
?
sizeof
(
double
)
:
sizeof
(
float
));
blockCenter
=
new
CudaArray
(
context
,
numAtomBlocks
,
4
*
elementSize
,
"blockCenter"
);
blockBoundingBox
=
new
CudaArray
(
context
,
numAtomBlocks
,
4
*
elementSize
,
"blockBoundingBox"
);
sortedBlocks
=
new
CudaArray
(
context
,
numAtomBlocks
,
2
*
elementSize
,
"sortedBlocks"
);
sortedBlockCenter
=
new
CudaArray
(
context
,
numAtomBlocks
+
1
,
4
*
elementSize
,
"sortedBlockCenter"
);
sortedBlockBoundingBox
=
new
CudaArray
(
context
,
numAtomBlocks
+
1
,
4
*
elementSize
,
"sortedBlockBoundingBox"
);
oldPositions
=
new
CudaArray
(
context
,
numAtoms
,
4
*
elementSize
,
"oldPositions"
);
if
(
context
.
getUseDoublePrecision
())
{
if
(
context
.
getUseDoublePrecision
())
{
blockCenter
=
CudaArray
::
create
<
double4
>
(
context
,
numAtomBlocks
,
"blockCenter"
);
vector
<
double4
>
oldPositionsVec
(
numAtoms
,
make_double4
(
1e30
,
1e30
,
1e30
,
0
)
);
blockBoundingBox
=
CudaArray
::
create
<
double4
>
(
context
,
numAtomBlocks
,
"blockBoundingBox"
);
oldPositions
->
upload
(
oldPositionsVec
);
}
}
else
{
else
{
blockCenter
=
CudaArray
::
create
<
float4
>
(
context
,
numAtomBlocks
,
"blockCenter"
);
vector
<
float4
>
oldPositionsVec
(
numAtoms
,
make_float4
(
1e30
f
,
1e30
f
,
1e30
f
,
0
)
);
blockBoundingBox
=
CudaArray
::
create
<
float4
>
(
context
,
numAtomBlocks
,
"blockBoundingBox"
);
oldPositions
->
upload
(
oldPositionsVec
);
}
}
rebuildNeighborList
=
CudaArray
::
create
<
int
>
(
context
,
1
,
"rebuildNeighborList"
);
blockSorter
=
new
CudaSort
(
context
,
new
BlockSortTrait
(
context
.
getUseDoublePrecision
()),
numAtomBlocks
);
vector
<
unsigned
int
>
count
(
1
,
0
);
vector
<
unsigned
int
>
count
(
1
,
0
);
interactionCount
->
upload
(
count
);
interactionCount
->
upload
(
count
);
}
}
...
@@ -248,11 +283,22 @@ void CudaNonbondedUtilities::initialize(const System& system) {
...
@@ -248,11 +283,22 @@ void CudaNonbondedUtilities::initialize(const System& system) {
if
(
kernelSource
.
size
()
>
0
)
if
(
kernelSource
.
size
()
>
0
)
forceKernel
=
createInteractionKernel
(
kernelSource
,
parameters
,
arguments
,
true
,
true
);
forceKernel
=
createInteractionKernel
(
kernelSource
,
parameters
,
arguments
,
true
,
true
);
if
(
useCutoff
)
{
if
(
useCutoff
)
{
double
padding
=
(
usePadding
?
0.1
*
cutoff
:
0.0
);
double
paddedCutoff
=
cutoff
+
padding
;
map
<
string
,
string
>
defines
;
map
<
string
,
string
>
defines
;
defines
[
"TILE_SIZE"
]
=
context
.
intToString
(
CudaContext
::
TileSize
);
defines
[
"NUM_BLOCKS"
]
=
context
.
intToString
(
context
.
getNumAtomBlocks
());
defines
[
"NUM_BLOCKS"
]
=
context
.
intToString
(
context
.
getNumAtomBlocks
());
defines
[
"CUTOFF_SQUARED"
]
=
context
.
doubleToString
(
cutoff
*
cutoff
);
defines
[
"NUM_ATOMS"
]
=
context
.
intToString
(
context
.
getNumAtoms
());
defines
[
"PADDING"
]
=
context
.
doubleToString
(
padding
);
defines
[
"PADDED_CUTOFF"
]
=
context
.
doubleToString
(
paddedCutoff
);
defines
[
"PADDED_CUTOFF_SQUARED"
]
=
context
.
doubleToString
(
paddedCutoff
*
paddedCutoff
);
defines
[
"NUM_TILES_WITH_EXCLUSIONS"
]
=
context
.
intToString
(
exclusionTiles
->
getSize
());
if
(
usePeriodic
)
if
(
usePeriodic
)
defines
[
"USE_PERIODIC"
]
=
"1"
;
defines
[
"USE_PERIODIC"
]
=
"1"
;
int
maxExclusions
=
0
;
for
(
int
i
=
0
;
i
<
(
int
)
exclusionBlocksForBlock
.
size
();
i
++
)
maxExclusions
=
(
maxExclusions
>
exclusionBlocksForBlock
[
i
].
size
()
?
maxExclusions
:
exclusionBlocksForBlock
[
i
].
size
());
defines
[
"MAX_EXCLUSIONS"
]
=
context
.
intToString
(
maxExclusions
);
CUmodule
interactingBlocksProgram
=
context
.
createModule
(
CudaKernelSources
::
vectorOps
+
CudaKernelSources
::
findInteractingBlocks
,
defines
);
CUmodule
interactingBlocksProgram
=
context
.
createModule
(
CudaKernelSources
::
vectorOps
+
CudaKernelSources
::
findInteractingBlocks
,
defines
);
findBlockBoundsKernel
=
context
.
getKernel
(
interactingBlocksProgram
,
"findBlockBounds"
);
findBlockBoundsKernel
=
context
.
getKernel
(
interactingBlocksProgram
,
"findBlockBounds"
);
findBlockBoundsArgs
.
push_back
(
&
numAtoms
);
findBlockBoundsArgs
.
push_back
(
&
numAtoms
);
...
@@ -261,7 +307,18 @@ void CudaNonbondedUtilities::initialize(const System& system) {
...
@@ -261,7 +307,18 @@ void CudaNonbondedUtilities::initialize(const System& system) {
findBlockBoundsArgs
.
push_back
(
&
context
.
getPosq
().
getDevicePointer
());
findBlockBoundsArgs
.
push_back
(
&
context
.
getPosq
().
getDevicePointer
());
findBlockBoundsArgs
.
push_back
(
&
blockCenter
->
getDevicePointer
());
findBlockBoundsArgs
.
push_back
(
&
blockCenter
->
getDevicePointer
());
findBlockBoundsArgs
.
push_back
(
&
blockBoundingBox
->
getDevicePointer
());
findBlockBoundsArgs
.
push_back
(
&
blockBoundingBox
->
getDevicePointer
());
findBlockBoundsArgs
.
push_back
(
&
interactionCount
->
getDevicePointer
());
findBlockBoundsArgs
.
push_back
(
&
rebuildNeighborList
->
getDevicePointer
());
findBlockBoundsArgs
.
push_back
(
&
sortedBlocks
->
getDevicePointer
());
sortBoxDataKernel
=
context
.
getKernel
(
interactingBlocksProgram
,
"sortBoxData"
);
sortBoxDataArgs
.
push_back
(
&
sortedBlocks
->
getDevicePointer
());
sortBoxDataArgs
.
push_back
(
&
blockCenter
->
getDevicePointer
());
sortBoxDataArgs
.
push_back
(
&
blockBoundingBox
->
getDevicePointer
());
sortBoxDataArgs
.
push_back
(
&
sortedBlockCenter
->
getDevicePointer
());
sortBoxDataArgs
.
push_back
(
&
sortedBlockBoundingBox
->
getDevicePointer
());
sortBoxDataArgs
.
push_back
(
&
context
.
getPosq
().
getDevicePointer
());
sortBoxDataArgs
.
push_back
(
&
oldPositions
->
getDevicePointer
());
sortBoxDataArgs
.
push_back
(
&
interactionCount
->
getDevicePointer
());
sortBoxDataArgs
.
push_back
(
&
rebuildNeighborList
->
getDevicePointer
());
findInteractingBlocksKernel
=
context
.
getKernel
(
interactingBlocksProgram
,
"findBlocksWithInteractions"
);
findInteractingBlocksKernel
=
context
.
getKernel
(
interactingBlocksProgram
,
"findBlocksWithInteractions"
);
findInteractingBlocksArgs
.
push_back
(
context
.
getPeriodicBoxSizePointer
());
findInteractingBlocksArgs
.
push_back
(
context
.
getPeriodicBoxSizePointer
());
findInteractingBlocksArgs
.
push_back
(
context
.
getInvPeriodicBoxSizePointer
());
findInteractingBlocksArgs
.
push_back
(
context
.
getInvPeriodicBoxSizePointer
());
...
@@ -269,35 +326,21 @@ void CudaNonbondedUtilities::initialize(const System& system) {
...
@@ -269,35 +326,21 @@ void CudaNonbondedUtilities::initialize(const System& system) {
findInteractingBlocksArgs
.
push_back
(
&
blockBoundingBox
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
blockBoundingBox
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
interactionCount
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
interactionCount
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
interactingTiles
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
interactingTiles
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
interacti
onFlag
s
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
interacti
ngAtom
s
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
context
.
getPosq
().
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
context
.
getPosq
().
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
maxTiles
);
findInteractingBlocksArgs
.
push_back
(
&
maxTiles
);
findInteractingBlocksArgs
.
push_back
(
&
startTileIndex
);
findInteractingBlocksArgs
.
push_back
(
&
startBlockIndex
);
findInteractingBlocksArgs
.
push_back
(
&
numTiles
);
findInteractingBlocksArgs
.
push_back
(
&
numBlocks
);
findInteractionsWithinBlocksKernel
=
context
.
getKernel
(
interactingBlocksProgram
,
"findInteractionsWithinBlocks"
);
findInteractingBlocksArgs
.
push_back
(
&
sortedBlocks
->
getDevicePointer
());
findInteractionsWithinBlocksArgs
.
push_back
(
context
.
getPeriodicBoxSizePointer
());
findInteractingBlocksArgs
.
push_back
(
&
sortedBlockCenter
->
getDevicePointer
());
findInteractionsWithinBlocksArgs
.
push_back
(
context
.
getInvPeriodicBoxSizePointer
());
findInteractingBlocksArgs
.
push_back
(
&
sortedBlockBoundingBox
->
getDevicePointer
());
findInteractionsWithinBlocksArgs
.
push_back
(
&
context
.
getPosq
().
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
exclusionIndices
->
getDevicePointer
());
findInteractionsWithinBlocksArgs
.
push_back
(
&
interactingTiles
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
exclusionRowIndices
->
getDevicePointer
());
findInteractionsWithinBlocksArgs
.
push_back
(
&
blockCenter
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
oldPositions
->
getDevicePointer
());
findInteractionsWithinBlocksArgs
.
push_back
(
&
blockBoundingBox
->
getDevicePointer
());
findInteractingBlocksArgs
.
push_back
(
&
rebuildNeighborList
->
getDevicePointer
());
findInteractionsWithinBlocksArgs
.
push_back
(
&
interactionFlags
->
getDevicePointer
());
findInteractionsWithinBlocksArgs
.
push_back
(
&
interactionCount
->
getDevicePointer
());
findInteractionsWithinBlocksArgs
.
push_back
(
&
maxTiles
);
}
}
}
}
int
CudaNonbondedUtilities
::
findExclusionIndex
(
int
x
,
int
y
,
const
vector
<
unsigned
int
>&
exclusionIndices
,
const
vector
<
unsigned
int
>&
exclusionRowIndices
)
{
if
(
x
<
y
)
throw
OpenMMException
(
"Internal error: called findExclusionIndex with x<y"
);
int
start
=
exclusionRowIndices
[
x
];
int
end
=
exclusionRowIndices
[
x
+
1
];
for
(
int
i
=
start
;
i
<
end
;
i
++
)
if
(
exclusionIndices
[
i
]
==
y
)
return
i
*
CudaContext
::
TileSize
;
throw
OpenMMException
(
"Internal error: exclusion in unexpected tile"
);
}
void
CudaNonbondedUtilities
::
prepareInteractions
()
{
void
CudaNonbondedUtilities
::
prepareInteractions
()
{
if
(
!
useCutoff
)
if
(
!
useCutoff
)
return
;
return
;
...
@@ -311,13 +354,17 @@ void CudaNonbondedUtilities::prepareInteractions() {
...
@@ -311,13 +354,17 @@ void CudaNonbondedUtilities::prepareInteractions() {
// Compute the neighbor list.
// Compute the neighbor list.
context
.
executeKernel
(
findBlockBoundsKernel
,
&
findBlockBoundsArgs
[
0
],
context
.
getNumAtoms
());
context
.
executeKernel
(
findBlockBoundsKernel
,
&
findBlockBoundsArgs
[
0
],
context
.
getNumAtoms
());
context
.
executeKernel
(
findInteractingBlocksKernel
,
&
findInteractingBlocksArgs
[
0
],
context
.
getNumAtoms
());
blockSorter
->
sort
(
*
sortedBlocks
);
context
.
executeKernel
(
findInteractionsWithinBlocksKernel
,
&
findInteractionsWithinBlocksArgs
[
0
],
context
.
getNumAtoms
(),
128
);
context
.
executeKernel
(
sortBoxDataKernel
,
&
sortBoxDataArgs
[
0
],
context
.
getNumAtoms
());
context
.
executeKernel
(
findInteractingBlocksKernel
,
&
findInteractingBlocksArgs
[
0
],
context
.
getNumAtoms
(),
256
);
}
}
void
CudaNonbondedUtilities
::
computeInteractions
()
{
void
CudaNonbondedUtilities
::
computeInteractions
()
{
if
(
kernelSource
.
size
()
>
0
)
if
(
kernelSource
.
size
()
>
0
)
{
context
.
executeKernel
(
forceKernel
,
&
forceArgs
[
0
],
numForceThreadBlocks
*
forceThreadBlockSize
,
forceThreadBlockSize
);
context
.
executeKernel
(
forceKernel
,
&
forceArgs
[
0
],
numForceThreadBlocks
*
forceThreadBlockSize
,
forceThreadBlockSize
);
if
(
context
.
getComputeForceCount
()
==
1
)
updateNeighborListSize
();
// This is the first time step, so check whether our initial guess was large enough.
}
}
}
void
CudaNonbondedUtilities
::
updateNeighborListSize
()
{
void
CudaNonbondedUtilities
::
updateNeighborListSize
()
{
...
@@ -332,26 +379,42 @@ void CudaNonbondedUtilities::updateNeighborListSize() {
...
@@ -332,26 +379,42 @@ void CudaNonbondedUtilities::updateNeighborListSize() {
// this from happening in the future.
// this from happening in the future.
maxTiles
=
(
int
)
(
1.2
*
pinnedInteractionCount
[
0
]);
maxTiles
=
(
int
)
(
1.2
*
pinnedInteractionCount
[
0
]);
int
num
Tiles
=
context
.
getNumAtomBlocks
()
*
(
context
.
getNumAtomBlocks
()
+
1
)
/
2
;
int
total
Tiles
=
context
.
getNumAtomBlocks
()
*
(
context
.
getNumAtomBlocks
()
+
1
)
/
2
;
if
(
maxTiles
>
num
Tiles
)
if
(
maxTiles
>
total
Tiles
)
maxTiles
=
num
Tiles
;
maxTiles
=
total
Tiles
;
delete
interactingTiles
;
delete
interactingTiles
;
delete
interactingAtoms
;
interactingTiles
=
NULL
;
// Avoid an error in the destructor if the following allocation fails
interactingAtoms
=
NULL
;
interactingTiles
=
CudaArray
::
create
<
ushort2
>
(
context
,
maxTiles
,
"interactingTiles"
);
interactingTiles
=
CudaArray
::
create
<
ushort2
>
(
context
,
maxTiles
,
"interactingTiles"
);
interactingAtoms
=
CudaArray
::
create
<
int
>
(
context
,
CudaContext
::
TileSize
*
maxTiles
,
"interactingAtoms"
);
if
(
forceArgs
.
size
()
>
0
)
if
(
forceArgs
.
size
()
>
0
)
forceArgs
[
8
]
=
&
interactingTiles
->
getDevicePointer
();
forceArgs
[
7
]
=
&
interactingTiles
->
getDevicePointer
();
findInteractingBlocksArgs
[
5
]
=
&
interactingTiles
->
getDevicePointer
();
findInteractingBlocksArgs
[
5
]
=
&
interactingTiles
->
getDevicePointer
();
delete
interactionFlags
;
interactionFlags
=
CudaArray
::
create
<
unsigned
int
>
(
context
,
maxTiles
,
"interactionFlags"
);
if
(
forceArgs
.
size
()
>
0
)
if
(
forceArgs
.
size
()
>
0
)
forceArgs
[
13
]
=
&
interactionFlags
->
getDevicePointer
();
forceArgs
[
13
]
=
&
interactingAtoms
->
getDevicePointer
();
findInteractingBlocksArgs
[
6
]
=
&
interactionFlags
->
getDevicePointer
();
findInteractingBlocksArgs
[
6
]
=
&
interactingAtoms
->
getDevicePointer
();
findInteractionsWithinBlocksArgs
[
3
]
=
&
interactingTiles
->
getDevicePointer
();
if
(
context
.
getUseDoublePrecision
())
{
findInteractionsWithinBlocksArgs
[
6
]
=
&
interactionFlags
->
getDevicePointer
();
vector
<
double4
>
oldPositionsVec
(
numAtoms
,
make_double4
(
1e30
,
1e30
,
1e30
,
0
));
oldPositions
->
upload
(
oldPositionsVec
);
}
else
{
vector
<
float4
>
oldPositionsVec
(
numAtoms
,
make_float4
(
1e30
f
,
1e30
f
,
1e30
f
,
0
));
oldPositions
->
upload
(
oldPositionsVec
);
}
}
}
void
CudaNonbondedUtilities
::
setTileRange
(
int
startTileIndex
,
int
numTiles
)
{
void
CudaNonbondedUtilities
::
setUsePadding
(
bool
padding
)
{
this
->
startTileIndex
=
startTileIndex
;
usePadding
=
padding
;
this
->
numTiles
=
numTiles
;
}
void
CudaNonbondedUtilities
::
setAtomBlockRange
(
double
startFraction
,
double
endFraction
)
{
int
numAtomBlocks
=
context
.
getNumAtomBlocks
();
startBlockIndex
=
(
int
)
(
startFraction
*
numAtomBlocks
);
numBlocks
=
(
int
)
(
endFraction
*
numAtomBlocks
)
-
startBlockIndex
;
int
totalTiles
=
context
.
getNumAtomBlocks
()
*
(
context
.
getNumAtomBlocks
()
+
1
)
/
2
;
startTileIndex
=
(
int
)
(
startFraction
*
totalTiles
);;
numTiles
=
(
int
)
(
endFraction
*
totalTiles
)
-
startTileIndex
;
}
}
CUfunction
CudaNonbondedUtilities
::
createInteractionKernel
(
const
string
&
source
,
vector
<
ParameterInfo
>&
params
,
vector
<
ParameterInfo
>&
arguments
,
bool
useExclusions
,
bool
isSymmetric
)
{
CUfunction
CudaNonbondedUtilities
::
createInteractionKernel
(
const
string
&
source
,
vector
<
ParameterInfo
>&
params
,
vector
<
ParameterInfo
>&
arguments
,
bool
useExclusions
,
bool
isSymmetric
)
{
...
@@ -447,6 +510,14 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
...
@@ -447,6 +510,14 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
defines
[
"NUM_ATOMS"
]
=
context
.
intToString
(
context
.
getNumAtoms
());
defines
[
"NUM_ATOMS"
]
=
context
.
intToString
(
context
.
getNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
context
.
intToString
(
context
.
getPaddedNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
context
.
intToString
(
context
.
getPaddedNumAtoms
());
defines
[
"NUM_BLOCKS"
]
=
context
.
intToString
(
context
.
getNumAtomBlocks
());
defines
[
"NUM_BLOCKS"
]
=
context
.
intToString
(
context
.
getNumAtomBlocks
());
defines
[
"TILE_SIZE"
]
=
context
.
intToString
(
CudaContext
::
TileSize
);
int
numExclusionTiles
=
exclusionTiles
->
getSize
();
defines
[
"NUM_TILES_WITH_EXCLUSIONS"
]
=
context
.
intToString
(
numExclusionTiles
);
int
numContexts
=
context
.
getPlatformData
().
contexts
.
size
();
int
startExclusionIndex
=
context
.
getContextIndex
()
*
numExclusionTiles
/
numContexts
;
int
endExclusionIndex
=
(
context
.
getContextIndex
()
+
1
)
*
numExclusionTiles
/
numContexts
;
defines
[
"FIRST_EXCLUSION_TILE"
]
=
context
.
intToString
(
startExclusionIndex
);
defines
[
"LAST_EXCLUSION_TILE"
]
=
context
.
intToString
(
endExclusionIndex
);
if
((
localDataSize
/
4
)
%
2
==
0
&&
!
context
.
getUseDoublePrecision
())
if
((
localDataSize
/
4
)
%
2
==
0
&&
!
context
.
getUseDoublePrecision
())
defines
[
"PARAMETER_SIZE_IS_EVEN"
]
=
"1"
;
defines
[
"PARAMETER_SIZE_IS_EVEN"
]
=
"1"
;
if
(
context
.
getComputeCapability
()
>=
3.0
&&
!
context
.
getUseDoublePrecision
())
if
(
context
.
getComputeCapability
()
>=
3.0
&&
!
context
.
getUseDoublePrecision
())
...
@@ -461,8 +532,7 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
...
@@ -461,8 +532,7 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
forceArgs
.
push_back
(
&
context
.
getEnergyBuffer
().
getDevicePointer
());
forceArgs
.
push_back
(
&
context
.
getEnergyBuffer
().
getDevicePointer
());
forceArgs
.
push_back
(
&
context
.
getPosq
().
getDevicePointer
());
forceArgs
.
push_back
(
&
context
.
getPosq
().
getDevicePointer
());
forceArgs
.
push_back
(
&
exclusions
->
getDevicePointer
());
forceArgs
.
push_back
(
&
exclusions
->
getDevicePointer
());
forceArgs
.
push_back
(
&
exclusionIndices
->
getDevicePointer
());
forceArgs
.
push_back
(
&
exclusionTiles
->
getDevicePointer
());
forceArgs
.
push_back
(
&
exclusionRowIndices
->
getDevicePointer
());
forceArgs
.
push_back
(
&
startTileIndex
);
forceArgs
.
push_back
(
&
startTileIndex
);
forceArgs
.
push_back
(
&
numTiles
);
forceArgs
.
push_back
(
&
numTiles
);
if
(
useCutoff
)
{
if
(
useCutoff
)
{
...
@@ -471,7 +541,8 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
...
@@ -471,7 +541,8 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
forceArgs
.
push_back
(
context
.
getPeriodicBoxSizePointer
());
forceArgs
.
push_back
(
context
.
getPeriodicBoxSizePointer
());
forceArgs
.
push_back
(
context
.
getInvPeriodicBoxSizePointer
());
forceArgs
.
push_back
(
context
.
getInvPeriodicBoxSizePointer
());
forceArgs
.
push_back
(
&
maxTiles
);
forceArgs
.
push_back
(
&
maxTiles
);
forceArgs
.
push_back
(
&
interactionFlags
->
getDevicePointer
());
forceArgs
.
push_back
(
&
blockCenter
->
getDevicePointer
());
forceArgs
.
push_back
(
&
interactingAtoms
->
getDevicePointer
());
}
}
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
for
(
int
i
=
0
;
i
<
(
int
)
params
.
size
();
i
++
)
forceArgs
.
push_back
(
&
params
[
i
].
getMemory
());
forceArgs
.
push_back
(
&
params
[
i
].
getMemory
());
...
...
platforms/cuda/src/CudaNonbondedUtilities.h
View file @
93c467b2
...
@@ -9,7 +9,7 @@
...
@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* *
* Portions copyright (c) 2009-201
2
Stanford University and the Authors. *
* Portions copyright (c) 2009-201
3
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Authors: Peter Eastman *
* Contributors: *
* Contributors: *
* *
* *
...
@@ -36,6 +36,8 @@
...
@@ -36,6 +36,8 @@
namespace
OpenMM
{
namespace
OpenMM
{
class
CudaSort
;
/**
/**
* This class provides a generic interface for calculating nonbonded interactions. It does this in two
* This class provides a generic interface for calculating nonbonded interactions. It does this in two
* ways. First, it can be used to create kernels that evaluate nonbonded interactions. Clients
* ways. First, it can be used to create kernels that evaluate nonbonded interactions. Clients
...
@@ -181,10 +183,10 @@ public:
...
@@ -181,10 +183,10 @@ public:
return
*
interactingTiles
;
return
*
interactingTiles
;
}
}
/**
/**
* Get the array containing
flags for
tile
s
with interactions.
* Get the array containing
the atoms in each
tile with interactions.
*/
*/
CudaArray
&
getInteracti
onFlag
s
()
{
CudaArray
&
getInteracti
ngAtom
s
()
{
return
*
interacti
onFlag
s
;
return
*
interacti
ngAtom
s
;
}
}
/**
/**
* Get the array containing exclusion flags.
* Get the array containing exclusion flags.
...
@@ -192,6 +194,12 @@ public:
...
@@ -192,6 +194,12 @@ public:
CudaArray
&
getExclusions
()
{
CudaArray
&
getExclusions
()
{
return
*
exclusions
;
return
*
exclusions
;
}
}
/**
* Get the array containing tiles with exclusions.
*/
CudaArray
&
getExclusionTiles
()
{
return
*
exclusionTiles
;
}
/**
/**
* Get the array containing the index into the exclusion array for each tile.
* Get the array containing the index into the exclusion array for each tile.
*/
*/
...
@@ -217,9 +225,17 @@ public:
...
@@ -217,9 +225,17 @@ public:
return
numTiles
;
return
numTiles
;
}
}
/**
/**
* Set the range of tiles that should be processed by this context.
* Set whether to add padding to the cutoff distance when building the neighbor list.
* This increases the size of the neighbor list (and thus the cost of computing interactions),
* but also means we don't need to rebuild it every time step. The default value is true,
* since usually this improves performance. For very expensive interactions, however,
* it may be better to set this to false.
*/
void
setUsePadding
(
bool
padding
);
/**
* Set the range of atom blocks and tiles that should be processed by this context.
*/
*/
void
set
TileRange
(
int
startTileIndex
,
int
numTiles
);
void
set
AtomBlockRange
(
double
startFraction
,
double
endFraction
);
/**
/**
* Create a Kernel for evaluating a nonbonded interaction. Cutoffs and periodic boundary conditions
* Create a Kernel for evaluating a nonbonded interaction. Cutoffs and periodic boundary conditions
* are assumed to be the same as those for the default interaction Kernel, since this kernel will use
* are assumed to be the same as those for the default interaction Kernel, since this kernel will use
...
@@ -232,42 +248,38 @@ public:
...
@@ -232,42 +248,38 @@ public:
* @param isSymmetric specifies whether the interaction is symmetric
* @param isSymmetric specifies whether the interaction is symmetric
*/
*/
CUfunction
createInteractionKernel
(
const
std
::
string
&
source
,
std
::
vector
<
ParameterInfo
>&
params
,
std
::
vector
<
ParameterInfo
>&
arguments
,
bool
useExclusions
,
bool
isSymmetric
);
CUfunction
createInteractionKernel
(
const
std
::
string
&
source
,
std
::
vector
<
ParameterInfo
>&
params
,
std
::
vector
<
ParameterInfo
>&
arguments
,
bool
useExclusions
,
bool
isSymmetric
);
/**
* This is a utility routine for locating data in the exclusions array. It takes the (x,y) indices of a tile,
* and returns the location in the array where the data for that tile begins.
*
* This routine requires that x >= y. If not, it will throw an exception.
*
* @param x the x index of the tile
* @param y the y index of the tile
* @param exclusionIndices the content of the exclusionIndices array
* @param exclusionRowIndices the content of the exclusionRowIndices array
* @return the index in the exclusions array at which the data for that tile begins
*/
static
int
findExclusionIndex
(
int
x
,
int
y
,
const
std
::
vector
<
unsigned
int
>&
exclusionIndices
,
const
std
::
vector
<
unsigned
int
>&
exclusionRowIndices
);
private:
private:
class
BlockSortTrait
;
CudaContext
&
context
;
CudaContext
&
context
;
CUfunction
forceKernel
;
CUfunction
forceKernel
;
CUfunction
findBlockBoundsKernel
;
CUfunction
findBlockBoundsKernel
;
CUfunction
sortBoxDataKernel
;
CUfunction
findInteractingBlocksKernel
;
CUfunction
findInteractingBlocksKernel
;
CUfunction
findInteractionsWithinBlocksKernel
;
CUfunction
findInteractionsWithinBlocksKernel
;
CudaArray
*
exclusionTiles
;
CudaArray
*
exclusions
;
CudaArray
*
exclusions
;
CudaArray
*
exclusionIndices
;
CudaArray
*
exclusionIndices
;
CudaArray
*
exclusionRowIndices
;
CudaArray
*
exclusionRowIndices
;
CudaArray
*
interactingTiles
;
CudaArray
*
interactingTiles
;
CudaArray
*
interacti
onFlag
s
;
CudaArray
*
interacti
ngAtom
s
;
CudaArray
*
interactionCount
;
CudaArray
*
interactionCount
;
CudaArray
*
blockCenter
;
CudaArray
*
blockCenter
;
CudaArray
*
blockBoundingBox
;
CudaArray
*
blockBoundingBox
;
std
::
vector
<
void
*>
forceArgs
,
findBlockBoundsArgs
,
findInteractingBlocksArgs
,
findInteractionsWithinBlocksArgs
;
CudaArray
*
sortedBlocks
;
CudaArray
*
sortedBlockCenter
;
CudaArray
*
sortedBlockBoundingBox
;
CudaArray
*
oldPositions
;
CudaArray
*
rebuildNeighborList
;
CudaSort
*
blockSorter
;
std
::
vector
<
void
*>
forceArgs
,
findBlockBoundsArgs
,
sortBoxDataArgs
,
findInteractingBlocksArgs
;
std
::
vector
<
std
::
vector
<
int
>
>
atomExclusions
;
std
::
vector
<
std
::
vector
<
int
>
>
atomExclusions
;
std
::
vector
<
ParameterInfo
>
parameters
;
std
::
vector
<
ParameterInfo
>
parameters
;
std
::
vector
<
ParameterInfo
>
arguments
;
std
::
vector
<
ParameterInfo
>
arguments
;
std
::
string
kernelSource
;
std
::
string
kernelSource
;
std
::
map
<
std
::
string
,
std
::
string
>
kernelDefines
;
std
::
map
<
std
::
string
,
std
::
string
>
kernelDefines
;
double
cutoff
;
double
cutoff
;
bool
useCutoff
,
usePeriodic
,
anyExclusions
;
bool
useCutoff
,
usePeriodic
,
anyExclusions
,
usePadding
;
int
startTileIndex
,
numTiles
,
maxTiles
,
numForceThreadBlocks
,
forceThreadBlockSize
,
nonbondedForceGroup
,
numAtoms
;
int
startTileIndex
,
numTiles
,
startBlockIndex
,
numBlocks
,
maxTiles
,
numForceThreadBlocks
,
forceThreadBlockSize
,
nonbondedForceGroup
,
numAtoms
;
};
};
/**
/**
...
...
platforms/cuda/src/CudaParallelKernels.cpp
View file @
93c467b2
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* *
* Portions copyright (c) 2011-201
2
Stanford University and the Authors. *
* Portions copyright (c) 2011-201
3
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Authors: Peter Eastman *
* Contributors: *
* Contributors: *
* *
* *
...
@@ -118,7 +118,7 @@ private:
...
@@ -118,7 +118,7 @@ private:
};
};
CudaParallelCalcForcesAndEnergyKernel
::
CudaParallelCalcForcesAndEnergyKernel
(
string
name
,
const
Platform
&
platform
,
CudaPlatform
::
PlatformData
&
data
)
:
CudaParallelCalcForcesAndEnergyKernel
::
CudaParallelCalcForcesAndEnergyKernel
(
string
name
,
const
Platform
&
platform
,
CudaPlatform
::
PlatformData
&
data
)
:
CalcForcesAndEnergyKernel
(
name
,
platform
),
data
(
data
),
completionTimes
(
data
.
contexts
.
size
()),
context
Tile
s
(
data
.
contexts
.
size
()),
contextForces
(
NULL
),
CalcForcesAndEnergyKernel
(
name
,
platform
),
data
(
data
),
completionTimes
(
data
.
contexts
.
size
()),
context
NonbondedFraction
s
(
data
.
contexts
.
size
()),
contextForces
(
NULL
),
pinnedPositionBuffer
(
NULL
),
pinnedForceBuffer
(
NULL
)
{
pinnedPositionBuffer
(
NULL
),
pinnedForceBuffer
(
NULL
)
{
for
(
int
i
=
0
;
i
<
(
int
)
data
.
contexts
.
size
();
i
++
)
for
(
int
i
=
0
;
i
<
(
int
)
data
.
contexts
.
size
();
i
++
)
kernels
.
push_back
(
Kernel
(
new
CudaCalcForcesAndEnergyKernel
(
name
,
platform
,
*
data
.
contexts
[
i
])));
kernels
.
push_back
(
Kernel
(
new
CudaCalcForcesAndEnergyKernel
(
name
,
platform
,
*
data
.
contexts
[
i
])));
...
@@ -141,6 +141,8 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
...
@@ -141,6 +141,8 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
sumKernel
=
cu
.
getKernel
(
module
,
"sumForces"
);
sumKernel
=
cu
.
getKernel
(
module
,
"sumForces"
);
for
(
int
i
=
0
;
i
<
(
int
)
kernels
.
size
();
i
++
)
for
(
int
i
=
0
;
i
<
(
int
)
kernels
.
size
();
i
++
)
getKernel
(
i
).
initialize
(
system
);
getKernel
(
i
).
initialize
(
system
);
for
(
int
i
=
0
;
i
<
(
int
)
contextNonbondedFractions
.
size
();
i
++
)
contextNonbondedFractions
[
i
]
=
1
/
(
double
)
contextNonbondedFractions
.
size
();
}
}
void
CudaParallelCalcForcesAndEnergyKernel
::
beginComputation
(
ContextImpl
&
context
,
bool
includeForce
,
bool
includeEnergy
,
int
groups
)
{
void
CudaParallelCalcForcesAndEnergyKernel
::
beginComputation
(
ContextImpl
&
context
,
bool
includeForce
,
bool
includeEnergy
,
int
groups
)
{
...
@@ -184,30 +186,26 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
...
@@ -184,30 +186,26 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
void
*
args
[]
=
{
&
cu
.
getForce
().
getDevicePointer
(),
&
contextForces
->
getDevicePointer
(),
&
bufferSize
,
&
numBuffers
};
void
*
args
[]
=
{
&
cu
.
getForce
().
getDevicePointer
(),
&
contextForces
->
getDevicePointer
(),
&
bufferSize
,
&
numBuffers
};
cu
.
executeKernel
(
sumKernel
,
args
,
bufferSize
);
cu
.
executeKernel
(
sumKernel
,
args
,
bufferSize
);
// Balance work between the contexts by transferring a
few
nonbonded
tiles
from the context that
// Balance work between the contexts by transferring a
little
nonbonded
work
from the context that
// finished last to the one that finished first.
// finished last to the one that finished first.
int
firstIndex
=
0
,
lastIndex
=
0
;
int
firstIndex
=
0
,
lastIndex
=
0
;
int
totalTiles
=
0
;
for
(
int
i
=
0
;
i
<
(
int
)
completionTimes
.
size
();
i
++
)
{
for
(
int
i
=
0
;
i
<
(
int
)
completionTimes
.
size
();
i
++
)
{
if
(
completionTimes
[
i
]
<
completionTimes
[
firstIndex
])
if
(
completionTimes
[
i
]
<
completionTimes
[
firstIndex
])
firstIndex
=
i
;
firstIndex
=
i
;
if
(
completionTimes
[
i
]
>
completionTimes
[
lastIndex
])
if
(
completionTimes
[
i
]
>
completionTimes
[
lastIndex
])
lastIndex
=
i
;
lastIndex
=
i
;
contextTiles
[
i
]
=
data
.
contexts
[
i
]
->
getNonbondedUtilities
().
getNumTiles
();
}
totalTiles
+=
contextTiles
[
i
];
double
fractionToTransfer
=
min
(
0.001
,
contextNonbondedFractions
[
lastIndex
]);
}
contextNonbondedFractions
[
firstIndex
]
+=
fractionToTransfer
;
int
tilesToTransfer
=
totalTiles
/
1000
;
contextNonbondedFractions
[
lastIndex
]
-=
fractionToTransfer
;
if
(
tilesToTransfer
<
1
)
double
startFraction
=
0.0
;
tilesToTransfer
=
1
;
for
(
int
i
=
0
;
i
<
(
int
)
contextNonbondedFractions
.
size
();
i
++
)
{
if
(
tilesToTransfer
>
contextTiles
[
lastIndex
])
double
endFraction
=
startFraction
+
contextNonbondedFractions
[
i
];
tilesToTransfer
=
contextTiles
[
lastIndex
];
if
(
i
==
contextNonbondedFractions
.
size
()
-
1
)
contextTiles
[
firstIndex
]
+=
tilesToTransfer
;
endFraction
=
1.0
;
// Avoid roundoff error
contextTiles
[
lastIndex
]
-=
tilesToTransfer
;
data
.
contexts
[
i
]
->
getNonbondedUtilities
().
setAtomBlockRange
(
startFraction
,
endFraction
);
int
startIndex
=
0
;
startFraction
=
endFraction
;
for
(
int
i
=
0
;
i
<
(
int
)
contextTiles
.
size
();
i
++
)
{
data
.
contexts
[
i
]
->
getNonbondedUtilities
().
setTileRange
(
startIndex
,
contextTiles
[
i
]);
startIndex
+=
contextTiles
[
i
];
}
}
}
}
return
energy
;
return
energy
;
...
...
platforms/cuda/src/CudaParallelKernels.h
View file @
93c467b2
...
@@ -9,7 +9,7 @@
...
@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* *
* Portions copyright (c) 2011-201
2
Stanford University and the Authors. *
* Portions copyright (c) 2011-201
3
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Authors: Peter Eastman *
* Contributors: *
* Contributors: *
* *
* *
...
@@ -80,7 +80,7 @@ private:
...
@@ -80,7 +80,7 @@ private:
CudaPlatform
::
PlatformData
&
data
;
CudaPlatform
::
PlatformData
&
data
;
std
::
vector
<
Kernel
>
kernels
;
std
::
vector
<
Kernel
>
kernels
;
std
::
vector
<
long
long
>
completionTimes
;
std
::
vector
<
long
long
>
completionTimes
;
std
::
vector
<
int
>
context
Tile
s
;
std
::
vector
<
double
>
context
NonbondedFraction
s
;
CudaArray
*
contextForces
;
CudaArray
*
contextForces
;
void
*
pinnedPositionBuffer
;
void
*
pinnedPositionBuffer
;
long
long
*
pinnedForceBuffer
;
long
long
*
pinnedForceBuffer
;
...
...
platforms/cuda/src/CudaSort.cpp
View file @
93c467b2
...
@@ -32,7 +32,7 @@ using namespace OpenMM;
...
@@ -32,7 +32,7 @@ using namespace OpenMM;
using
namespace
std
;
using
namespace
std
;
CudaSort
::
CudaSort
(
CudaContext
&
context
,
SortTrait
*
trait
,
unsigned
int
length
)
:
context
(
context
),
trait
(
trait
),
CudaSort
::
CudaSort
(
CudaContext
&
context
,
SortTrait
*
trait
,
unsigned
int
length
)
:
context
(
context
),
trait
(
trait
),
dataRange
(
NULL
),
bucketOfElement
(
NULL
),
offsetInBucket
(
NULL
),
bucketOffset
(
NULL
),
buckets
(
NULL
)
{
dataRange
(
NULL
),
bucketOfElement
(
NULL
),
offsetInBucket
(
NULL
),
bucketOffset
(
NULL
),
buckets
(
NULL
)
,
dataLength
(
length
)
{
// Create kernels.
// Create kernels.
map
<
string
,
string
>
replacements
;
map
<
string
,
string
>
replacements
;
...
@@ -43,6 +43,7 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
...
@@ -43,6 +43,7 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
replacements
[
"MAX_KEY"
]
=
trait
->
getMaxKey
();
replacements
[
"MAX_KEY"
]
=
trait
->
getMaxKey
();
replacements
[
"MAX_VALUE"
]
=
trait
->
getMaxValue
();
replacements
[
"MAX_VALUE"
]
=
trait
->
getMaxValue
();
CUmodule
module
=
context
.
createModule
(
context
.
replaceStrings
(
CudaKernelSources
::
sort
,
replacements
));
CUmodule
module
=
context
.
createModule
(
context
.
replaceStrings
(
CudaKernelSources
::
sort
,
replacements
));
shortListKernel
=
context
.
getKernel
(
module
,
"sortShortList"
);
computeRangeKernel
=
context
.
getKernel
(
module
,
"computeRange"
);
computeRangeKernel
=
context
.
getKernel
(
module
,
"computeRange"
);
assignElementsKernel
=
context
.
getKernel
(
module
,
"assignElementsToBuckets"
);
assignElementsKernel
=
context
.
getKernel
(
module
,
"assignElementsToBuckets"
);
computeBucketPositionsKernel
=
context
.
getKernel
(
module
,
"computeBucketPositions"
);
computeBucketPositionsKernel
=
context
.
getKernel
(
module
,
"computeBucketPositions"
);
...
@@ -53,15 +54,16 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
...
@@ -53,15 +54,16 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
int
maxBlockSize
;
int
maxBlockSize
;
cuDeviceGetAttribute
(
&
maxBlockSize
,
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X
,
context
.
getDevice
());
cuDeviceGetAttribute
(
&
maxBlockSize
,
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X
,
context
.
getDevice
());
int
maxSharedMem
;
cuDeviceGetAttribute
(
&
maxSharedMem
,
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK
,
context
.
getDevice
());
unsigned
int
maxLocalBuffer
=
(
unsigned
int
)
((
maxSharedMem
/
trait
->
getDataSize
())
/
2
);
isShortList
=
(
length
<=
maxLocalBuffer
);
for
(
rangeKernelSize
=
1
;
rangeKernelSize
*
2
<=
maxBlockSize
;
rangeKernelSize
*=
2
)
for
(
rangeKernelSize
=
1
;
rangeKernelSize
*
2
<=
maxBlockSize
;
rangeKernelSize
*=
2
)
;
;
positionsKernelSize
=
rangeKernelSize
;
positionsKernelSize
=
rangeKernelSize
;
sortKernelSize
=
rangeKernelSize
/
2
;
sortKernelSize
=
(
isShortList
?
rangeKernelSize
/
2
:
rangeKernelSize
/
4
)
;
if
(
rangeKernelSize
>
length
)
if
(
rangeKernelSize
>
length
)
rangeKernelSize
=
length
;
rangeKernelSize
=
length
;
int
maxSharedMem
;
cuDeviceGetAttribute
(
&
maxSharedMem
,
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK
,
context
.
getDevice
());
unsigned
int
maxLocalBuffer
=
(
unsigned
int
)
((
maxSharedMem
/
trait
->
getDataSize
())
/
2
);
if
(
sortKernelSize
>
maxLocalBuffer
)
if
(
sortKernelSize
>
maxLocalBuffer
)
sortKernelSize
=
maxLocalBuffer
;
sortKernelSize
=
maxLocalBuffer
;
unsigned
int
targetBucketSize
=
sortKernelSize
/
2
;
unsigned
int
targetBucketSize
=
sortKernelSize
/
2
;
...
@@ -73,11 +75,13 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
...
@@ -73,11 +75,13 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
// Create workspace arrays.
// Create workspace arrays.
if
(
!
isShortList
)
{
dataRange
=
new
CudaArray
(
context
,
2
,
trait
->
getKeySize
(),
"sortDataRange"
);
dataRange
=
new
CudaArray
(
context
,
2
,
trait
->
getKeySize
(),
"sortDataRange"
);
bucketOffset
=
CudaArray
::
create
<
uint1
>
(
context
,
numBuckets
,
"bucketOffset"
);
bucketOffset
=
CudaArray
::
create
<
uint1
>
(
context
,
numBuckets
,
"bucketOffset"
);
bucketOfElement
=
CudaArray
::
create
<
uint1
>
(
context
,
length
,
"bucketOfElement"
);
bucketOfElement
=
CudaArray
::
create
<
uint1
>
(
context
,
length
,
"bucketOfElement"
);
offsetInBucket
=
CudaArray
::
create
<
uint1
>
(
context
,
length
,
"offsetInBucket"
);
offsetInBucket
=
CudaArray
::
create
<
uint1
>
(
context
,
length
,
"offsetInBucket"
);
buckets
=
new
CudaArray
(
context
,
length
,
trait
->
getDataSize
(),
"buckets"
);
buckets
=
new
CudaArray
(
context
,
length
,
trait
->
getDataSize
(),
"buckets"
);
}
}
}
CudaSort
::~
CudaSort
()
{
CudaSort
::~
CudaSort
()
{
...
@@ -95,22 +99,27 @@ CudaSort::~CudaSort() {
...
@@ -95,22 +99,27 @@ CudaSort::~CudaSort() {
}
}
void
CudaSort
::
sort
(
CudaArray
&
data
)
{
void
CudaSort
::
sort
(
CudaArray
&
data
)
{
if
(
data
.
getSize
()
!=
bucketOfElement
->
getSize
()
||
data
.
getElementSize
()
!=
trait
->
getDataSize
())
if
(
data
.
getSize
()
!=
dataLength
||
data
.
getElementSize
()
!=
trait
->
getDataSize
())
throw
OpenMMException
(
"CudaSort called with different data size"
);
throw
OpenMMException
(
"CudaSort called with different data size"
);
if
(
data
.
getSize
()
==
0
)
if
(
data
.
getSize
()
==
0
)
return
;
return
;
if
(
isShortList
)
{
// We can use a simpler sort kernel that does the entire operation at once in local memory.
void
*
sortArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
dataLength
};
context
.
executeKernel
(
shortListKernel
,
sortArgs
,
sortKernelSize
,
sortKernelSize
,
dataLength
*
trait
->
getDataSize
());
}
else
{
// Compute the range of data values.
// Compute the range of data values.
unsigned
int
dataSize
=
data
.
getSize
();
void
*
rangeArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
dataLength
,
&
dataRange
->
getDevicePointer
()};
void
*
rangeArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
dataSize
,
&
dataRange
->
getDevicePointer
()};
context
.
executeKernel
(
computeRangeKernel
,
rangeArgs
,
rangeKernelSize
,
rangeKernelSize
,
rangeKernelSize
*
trait
->
getKeySize
());
context
.
executeKernel
(
computeRangeKernel
,
rangeArgs
,
rangeKernelSize
,
rangeKernelSize
,
rangeKernelSize
*
trait
->
getKeySize
());
// Assign array elements to buckets.
// Assign array elements to buckets.
unsigned
int
numBuckets
=
bucketOffset
->
getSize
();
unsigned
int
numBuckets
=
bucketOffset
->
getSize
();
context
.
clearBuffer
(
*
bucketOffset
);
context
.
clearBuffer
(
*
bucketOffset
);
void
*
elementsArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
data
Size
,
&
numBuckets
,
&
dataRange
->
getDevicePointer
(),
void
*
elementsArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
data
Length
,
&
numBuckets
,
&
dataRange
->
getDevicePointer
(),
&
bucketOffset
->
getDevicePointer
(),
&
bucketOfElement
->
getDevicePointer
(),
&
offsetInBucket
->
getDevicePointer
()};
&
bucketOffset
->
getDevicePointer
(),
&
bucketOfElement
->
getDevicePointer
(),
&
offsetInBucket
->
getDevicePointer
()};
context
.
executeKernel
(
assignElementsKernel
,
elementsArgs
,
data
.
getSize
());
context
.
executeKernel
(
assignElementsKernel
,
elementsArgs
,
data
.
getSize
());
...
@@ -121,7 +130,7 @@ void CudaSort::sort(CudaArray& data) {
...
@@ -121,7 +130,7 @@ void CudaSort::sort(CudaArray& data) {
// Copy the data into the buckets.
// Copy the data into the buckets.
void
*
copyArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
buckets
->
getDevicePointer
(),
&
data
Size
,
&
bucketOffset
->
getDevicePointer
(),
void
*
copyArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
buckets
->
getDevicePointer
(),
&
data
Length
,
&
bucketOffset
->
getDevicePointer
(),
&
bucketOfElement
->
getDevicePointer
(),
&
offsetInBucket
->
getDevicePointer
()};
&
bucketOfElement
->
getDevicePointer
(),
&
offsetInBucket
->
getDevicePointer
()};
context
.
executeKernel
(
copyToBucketsKernel
,
copyArgs
,
data
.
getSize
());
context
.
executeKernel
(
copyToBucketsKernel
,
copyArgs
,
data
.
getSize
());
...
@@ -129,4 +138,5 @@ void CudaSort::sort(CudaArray& data) {
...
@@ -129,4 +138,5 @@ void CudaSort::sort(CudaArray& data) {
void
*
sortArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
buckets
->
getDevicePointer
(),
&
numBuckets
,
&
bucketOffset
->
getDevicePointer
()};
void
*
sortArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
buckets
->
getDevicePointer
(),
&
numBuckets
,
&
bucketOffset
->
getDevicePointer
()};
context
.
executeKernel
(
sortBucketsKernel
,
sortArgs
,
((
data
.
getSize
()
+
sortKernelSize
-
1
)
/
sortKernelSize
)
*
sortKernelSize
,
sortKernelSize
,
sortKernelSize
*
trait
->
getDataSize
());
context
.
executeKernel
(
sortBucketsKernel
,
sortArgs
,
((
data
.
getSize
()
+
sortKernelSize
-
1
)
/
sortKernelSize
)
*
sortKernelSize
,
sortKernelSize
,
sortKernelSize
*
trait
->
getDataSize
());
}
}
}
platforms/cuda/src/CudaSort.h
View file @
93c467b2
...
@@ -92,8 +92,9 @@ private:
...
@@ -92,8 +92,9 @@ private:
CudaArray
*
offsetInBucket
;
CudaArray
*
offsetInBucket
;
CudaArray
*
bucketOffset
;
CudaArray
*
bucketOffset
;
CudaArray
*
buckets
;
CudaArray
*
buckets
;
CUfunction
computeRangeKernel
,
assignElementsKernel
,
computeBucketPositionsKernel
,
copyToBucketsKernel
,
sortBucketsKernel
;
CUfunction
shortListKernel
,
computeRangeKernel
,
assignElementsKernel
,
computeBucketPositionsKernel
,
copyToBucketsKernel
,
sortBucketsKernel
;
unsigned
int
rangeKernelSize
,
positionsKernelSize
,
sortKernelSize
;
unsigned
int
dataLength
,
rangeKernelSize
,
positionsKernelSize
,
sortKernelSize
;
bool
isShortList
;
};
};
/**
/**
...
...
platforms/cuda/src/kernels/coulombLennardJones.cu
View file @
93c467b2
#if USE_EWALD
#if USE_EWALD
bool
needCorrection
=
isExcluded
&&
atom1
!=
atom2
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
;
bool
needCorrection
=
hasExclusions
&&
isExcluded
&&
atom1
!=
atom2
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
;
if
(
!
isExcluded
||
needCorrection
)
{
if
(
!
isExcluded
||
needCorrection
)
{
real
tempForce
=
0.0
f
;
if
(
r2
<
CUTOFF_SQUARED
||
needCorrection
)
{
if
(
r2
<
CUTOFF_SQUARED
||
needCorrection
)
{
const
real
alphaR
=
EWALD_ALPHA
*
r
;
const
real
alphaR
=
EWALD_ALPHA
*
r
;
const
real
expAlphaRSqr
=
EXP
(
-
alphaR
*
alphaR
);
const
real
expAlphaRSqr
=
EXP
(
-
alphaR
*
alphaR
);
...
@@ -16,6 +15,7 @@ if (!isExcluded || needCorrection) {
...
@@ -16,6 +15,7 @@ if (!isExcluded || needCorrection) {
t
*=
t
;
t
*=
t
;
t
*=
t
;
t
*=
t
;
const
real
erfcAlphaR
=
RECIP
(
t
*
t
);
const
real
erfcAlphaR
=
RECIP
(
t
*
t
);
real
tempForce
=
0.0
f
;
if
(
needCorrection
)
{
if
(
needCorrection
)
{
// Subtract off the part of this interaction that was included in the reciprocal space contribution.
// Subtract off the part of this interaction that was included in the reciprocal space contribution.
...
@@ -36,8 +36,8 @@ if (!isExcluded || needCorrection) {
...
@@ -36,8 +36,8 @@ if (!isExcluded || needCorrection) {
tempEnergy
+=
prefactor
*
erfcAlphaR
;
tempEnergy
+=
prefactor
*
erfcAlphaR
;
#endif
#endif
}
}
}
dEdR
+=
tempForce
*
invR
*
invR
;
dEdR
+=
tempForce
*
invR
*
invR
;
}
}
}
#else
#else
{
{
...
...
platforms/cuda/src/kernels/customGBEnergyN2.cu
View file @
93c467b2
#define STORE_DERIVATIVE_1(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (deriv##INDEX##_1*0x100000000)));
#define STORE_DERIVATIVE_1(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (deriv##INDEX##_1*0x100000000)));
#define STORE_DERIVATIVE_2(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].deriv##INDEX*0x100000000)));
#define STORE_DERIVATIVE_2(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].deriv##INDEX*0x100000000)));
#define TILE_SIZE 32
typedef
struct
{
typedef
struct
{
real4
posq
;
real4
posq
;
...
@@ -15,88 +14,43 @@ typedef struct {
...
@@ -15,88 +14,43 @@ typedef struct {
* Compute a force based on pair interactions.
* Compute a force based on pair interactions.
*/
*/
extern
"C"
__global__
void
computeN2Energy
(
unsigned
long
long
*
__restrict__
forceBuffers
,
real
*
__restrict__
energyBuffer
,
extern
"C"
__global__
void
computeN2Energy
(
unsigned
long
long
*
__restrict__
forceBuffers
,
real
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
unsigned
int
*
__restrict__
exclusions
,
const
unsigned
int
*
__restrict__
exclusionIndices
,
const
real4
*
__restrict__
posq
,
const
unsigned
int
*
__restrict__
exclusions
,
const
ushort2
*
__restrict__
exclusionTiles
,
const
unsigned
int
*
__restrict__
exclusionRowIndices
,
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
unsigned
int
*
__restrict__
interacti
onFlag
s
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
const
unsigned
int
*
__restrict__
interacti
ngAtom
s
#else
#else
unsigned
int
numTiles
unsigned
int
numTiles
#endif
#endif
PARAMETER_ARGUMENTS
)
{
PARAMETER_ARGUMENTS
)
{
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
const
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
#ifdef USE_CUTOFF
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
unsigned
int
numTiles
=
interactionCount
[
0
];
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
unsigned
int
pos
=
warp
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
unsigned
int
end
=
(
warp
+
1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
#else
unsigned
int
pos
=
warp
*
numTiles
/
totalWarps
;
unsigned
int
end
=
(
warp
+
1
)
*
numTiles
/
totalWarps
;
#endif
real
energy
=
0
;
real
energy
=
0
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
__shared__
unsigned
int
exclusionRange
[
2
*
WARPS_PER_GROUP
];
__shared__
int
exclusionIndex
[
WARPS_PER_GROUP
];
do
{
// First loop: process tiles that contain exclusions.
// Extract the coordinates of this tile
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE
+
warp
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE
+
(
warp
+
1
)
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
const
unsigned
int
localGroupIndex
=
threadIdx
.
x
/
TILE_SIZE
;
for
(
int
pos
=
firstExclusionTile
;
pos
<
lastExclusionTile
;
pos
++
)
{
unsigned
int
x
,
y
;
const
ushort2
tileIndices
=
exclusionTiles
[
pos
];
const
unsigned
int
x
=
tileIndices
.
x
;
const
unsigned
int
y
=
tileIndices
.
y
;
real3
force
=
make_real3
(
0
);
real3
force
=
make_real3
(
0
);
DECLARE_ATOM1_DERIVATIVES
DECLARE_ATOM1_DERIVATIVES
if
(
pos
<
end
)
{
#ifdef USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles
[
pos
];
x
=
tileIndices
.
x
;
y
=
tileIndices
.
y
;
}
else
#endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}
}
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real4
posq1
=
posq
[
atom1
];
real4
posq1
=
posq
[
atom1
];
LOAD_ATOM1_PARAMETERS
LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS
#ifdef USE_EXCLUSIONS
if
(
tgx
<
2
)
unsigned
int
excl
=
exclusions
[
pos
*
TILE_SIZE
+
tgx
];
exclusionRange
[
2
*
localGroupIndex
+
tgx
]
=
exclusionRowIndices
[
x
+
tgx
];
if
(
tgx
==
0
)
exclusionIndex
[
localGroupIndex
]
=
-
1
;
for
(
unsigned
int
i
=
exclusionRange
[
2
*
localGroupIndex
]
+
tgx
;
i
<
exclusionRange
[
2
*
localGroupIndex
+
1
];
i
+=
TILE_SIZE
)
if
(
exclusionIndices
[
i
]
==
y
)
exclusionIndex
[
localGroupIndex
]
=
i
*
TILE_SIZE
;
bool
hasExclusions
=
(
exclusionIndex
[
localGroupIndex
]
>
-
1
);
#else
bool
hasExclusions
=
false
;
#endif
#endif
if
(
pos
>=
end
)
if
(
x
==
y
)
{
;
// This warp is done.
else
if
(
x
==
y
)
{
// This tile is on the diagonal.
// This tile is on the diagonal.
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
localData
[
localAtomIndex
].
posq
=
posq1
;
localData
[
localAtomIndex
].
posq
=
posq1
;
LOAD_LOCAL_PARAMETERS_FROM_1
LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS
unsigned
int
excl
=
exclusions
[
exclusionIndex
[
localGroupIndex
]
+
tgx
];
#endif
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
#ifdef USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
);
#endif
int
atom2
=
tbx
+
j
;
int
atom2
=
tbx
+
j
;
real4
posq2
=
localData
[
atom2
].
posq
;
real4
posq2
=
localData
[
atom2
].
posq
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
...
@@ -115,6 +69,9 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
...
@@ -115,6 +69,9 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
atom2
=
y
*
TILE_SIZE
+
j
;
atom2
=
y
*
TILE_SIZE
+
j
;
real
dEdR
=
0
;
real
dEdR
=
0
;
real
tempEnergy
=
0
;
real
tempEnergy
=
0
;
#ifdef USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
);
#endif
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
atom1
!=
atom2
)
{
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
atom1
!=
atom2
)
{
COMPUTE_INTERACTION
COMPUTE_INTERACTION
dEdR
/=
-
r
;
dEdR
/=
-
r
;
...
@@ -136,32 +93,16 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
...
@@ -136,32 +93,16 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
// This is an off-diagonal tile.
// This is an off-diagonal tile.
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
if
(
lasty
!=
y
)
{
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
localData
[
localAtomIndex
].
posq
=
posq
[
j
];
localData
[
localAtomIndex
].
posq
=
posq
[
j
];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
localData
[
localAtomIndex
].
force
=
make_real3
(
0
);
localData
[
localAtomIndex
].
force
=
make_real3
(
0
);
CLEAR_LOCAL_DERIVATIVES
CLEAR_LOCAL_DERIVATIVES
#ifdef USE_CUTOFF
unsigned
int
flags
=
(
numTiles
<=
maxTiles
?
interactionFlags
[
pos
]
:
0xFFFFFFFF
);
if
(
!
hasExclusions
&&
flags
==
0
)
{
// No interactions in this tile.
}
else
#endif
{
// Compute the full set of interactions in this tile.
#ifdef USE_EXCLUSIONS
#ifdef USE_EXCLUSIONS
unsigned
int
excl
=
(
hasExclusions
?
exclusions
[
exclusionIndex
[
localGroupIndex
]
+
tgx
]
:
0xFFFFFFFF
);
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
TILE_SIZE
-
tgx
));
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
TILE_SIZE
-
tgx
));
#endif
#endif
unsigned
int
tj
=
tgx
;
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
#ifdef USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
);
#endif
int
atom2
=
tbx
+
tj
;
int
atom2
=
tbx
+
tj
;
real4
posq2
=
localData
[
atom2
].
posq
;
real4
posq2
=
localData
[
atom2
].
posq
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
...
@@ -180,6 +121,9 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
...
@@ -180,6 +121,9 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
atom2
=
y
*
TILE_SIZE
+
tj
;
atom2
=
y
*
TILE_SIZE
+
tj
;
real
dEdR
=
0
;
real
dEdR
=
0
;
real
tempEnergy
=
0
;
real
tempEnergy
=
0
;
#ifdef USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
);
#endif
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
COMPUTE_INTERACTION
COMPUTE_INTERACTION
dEdR
/=
-
r
;
dEdR
/=
-
r
;
...
@@ -203,27 +147,216 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
...
@@ -203,27 +147,216 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
}
}
}
}
lasty
=
y
;
// Write results.
// Write results.
if
(
pos
<
end
)
{
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
const
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
z
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
z
*
0x100000000
)));
STORE_DERIVATIVES_1
STORE_DERIVATIVES_1
}
if
(
x
!=
y
)
{
if
(
pos
<
end
&&
x
!=
y
)
{
offset
=
y
*
TILE_SIZE
+
tgx
;
const
unsigned
int
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
STORE_DERIVATIVES_2
STORE_DERIVATIVES_2
}
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
unsigned
int
numTiles
=
interactionCount
[
0
];
int
pos
=
warp
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
int
end
=
(
warp
+
1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
#else
int
pos
=
warp
*
numTiles
/
totalWarps
;
int
end
=
(
warp
+
1
)
*
numTiles
/
totalWarps
;
#endif
int
skipBase
=
0
;
int
currentSkipIndex
=
tbx
;
__shared__
int
atomIndices
[
THREAD_BLOCK_SIZE
];
__shared__
int
skipTiles
[
THREAD_BLOCK_SIZE
];
skipTiles
[
threadIdx
.
x
]
=
-
1
;
while
(
pos
<
end
)
{
const
bool
isExcluded
=
false
;
real3
force
=
make_real3
(
0
);
DECLARE_ATOM1_DERIVATIVES
bool
includeTile
=
true
;
// Extract the coordinates of this tile.
unsigned
int
x
,
y
;
bool
singlePeriodicCopy
=
false
;
#ifdef USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles
[
pos
];
x
=
tileIndices
.
x
;
singlePeriodicCopy
=
tileIndices
.
y
;
}
else
#endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}
// Skip over tiles that have exclusions, since they were already processed.
while
(
skipTiles
[
tbx
+
TILE_SIZE
-
1
]
<
pos
)
{
if
(
skipBase
+
tgx
<
NUM_TILES_WITH_EXCLUSIONS
)
{
ushort2
tile
=
exclusionTiles
[
skipBase
+
tgx
];
skipTiles
[
threadIdx
.
x
]
=
tile
.
x
+
tile
.
y
*
NUM_BLOCKS
-
tile
.
y
*
(
tile
.
y
+
1
)
/
2
;
}
else
skipTiles
[
threadIdx
.
x
]
=
end
;
skipBase
+=
TILE_SIZE
;
currentSkipIndex
=
tbx
;
}
while
(
skipTiles
[
currentSkipIndex
]
<
pos
)
currentSkipIndex
++
;
includeTile
=
(
skipTiles
[
currentSkipIndex
]
!=
pos
);
}
if
(
includeTile
)
{
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
// Load atom data for this tile.
real4
posq1
=
posq
[
atom1
];
LOAD_ATOM1_PARAMETERS
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
#ifdef USE_CUTOFF
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
]
:
y
*
TILE_SIZE
+
tgx
);
#else
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
atomIndices
[
threadIdx
.
x
]
=
j
;
if
(
j
<
PADDED_NUM_ATOMS
)
{
localData
[
localAtomIndex
].
posq
=
posq
[
j
];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData
[
localAtomIndex
].
force
=
make_real3
(
0
);
CLEAR_LOCAL_DERIVATIVES
}
#ifdef USE_PERIODIC
if
(
singlePeriodicCopy
)
{
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
real4
blockCenterX
=
blockCenter
[
x
];
posq1
.
x
-=
floor
((
posq1
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
posq1
.
y
-=
floor
((
posq1
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
posq1
.
z
-=
floor
((
posq1
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
localData
[
threadIdx
.
x
].
posq
.
x
-=
floor
((
localData
[
threadIdx
.
x
].
posq
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
localData
[
threadIdx
.
x
].
posq
.
y
-=
floor
((
localData
[
threadIdx
.
x
].
posq
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
localData
[
threadIdx
.
x
].
posq
.
z
-=
floor
((
localData
[
threadIdx
.
x
].
posq
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
real4
posq2
=
localData
[
atom2
].
posq
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
LOAD_ATOM2_PARAMETERS
atom2
=
atomIndices
[
tbx
+
tj
];
real
dEdR
=
0
;
real
tempEnergy
=
0
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
COMPUTE_INTERACTION
dEdR
/=
-
r
;
}
energy
+=
tempEnergy
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
atom2
=
tbx
+
tj
;
localData
[
atom2
].
force
.
x
+=
delta
.
x
;
localData
[
atom2
].
force
.
y
+=
delta
.
y
;
localData
[
atom2
].
force
.
z
+=
delta
.
z
;
RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
}
#endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
else
#endif
{
// We need to apply periodic boundary conditions separately for each interaction.
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
real4
posq2
=
localData
[
atom2
].
posq
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
LOAD_ATOM2_PARAMETERS
atom2
=
atomIndices
[
tbx
+
tj
];
real
dEdR
=
0
;
real
tempEnergy
=
0
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
COMPUTE_INTERACTION
dEdR
/=
-
r
;
}
energy
+=
tempEnergy
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
atom2
=
tbx
+
tj
;
localData
[
atom2
].
force
.
x
+=
delta
.
x
;
localData
[
atom2
].
force
.
y
+=
delta
.
y
;
localData
[
atom2
].
force
.
z
+=
delta
.
z
;
RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
}
#endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
// Write results.
atomicAdd
(
&
forceBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
z
*
0x100000000
)));
unsigned
int
offset
=
atom1
;
STORE_DERIVATIVES_1
#ifdef USE_CUTOFF
unsigned
int
atom2
=
atomIndices
[
threadIdx
.
x
];
#else
unsigned
int
atom2
=
y
*
TILE_SIZE
+
tgx
;
#endif
if
(
atom2
<
PADDED_NUM_ATOMS
)
{
atomicAdd
(
&
forceBuffers
[
atom2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
offset
=
atom2
;
STORE_DERIVATIVES_2
}
}
pos
++
;
pos
++
;
}
while
(
pos
<
end
);
}
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
}
}
platforms/cuda/src/kernels/customGBValueN2.cu
View file @
93c467b2
#define TILE_SIZE 32
typedef
struct
{
typedef
struct
{
real4
posq
;
real4
posq
;
real
value
,
temp
;
real
value
,
temp
;
...
@@ -13,86 +11,41 @@ typedef struct {
...
@@ -13,86 +11,41 @@ typedef struct {
* Compute a value based on pair interactions.
* Compute a value based on pair interactions.
*/
*/
extern
"C"
__global__
void
computeN2Value
(
const
real4
*
__restrict__
posq
,
const
unsigned
int
*
__restrict__
exclusions
,
extern
"C"
__global__
void
computeN2Value
(
const
real4
*
__restrict__
posq
,
const
unsigned
int
*
__restrict__
exclusions
,
const
u
nsigned
int
*
__restrict__
exclusion
Indices
,
const
unsigned
int
*
__restrict__
exclusionRowIndic
es
,
unsigned
long
long
*
__restrict__
global_value
,
const
u
short2
*
__restrict__
exclusion
Til
es
,
unsigned
long
long
*
__restrict__
global_value
,
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
unsigned
int
*
__restrict__
interacti
onFlag
s
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
const
unsigned
int
*
__restrict__
interacti
ngAtom
s
#else
#else
unsigned
int
numTiles
unsigned
int
numTiles
#endif
#endif
PARAMETER_ARGUMENTS
)
{
PARAMETER_ARGUMENTS
)
{
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
const
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
#ifdef USE_CUTOFF
unsigned
int
numTiles
=
interactionCount
[
0
];
unsigned
int
pos
=
warp
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
unsigned
int
end
=
(
warp
+
1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
#else
unsigned
int
pos
=
warp
*
numTiles
/
totalWarps
;
unsigned
int
end
=
(
warp
+
1
)
*
numTiles
/
totalWarps
;
#endif
real
energy
=
0
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
__shared__
unsigned
int
exclusionRange
[
2
*
WARPS_PER_GROUP
];
__shared__
int
exclusionIndex
[
WARPS_PER_GROUP
];
do
{
// Extract the coordinates of this tile
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
const
unsigned
int
localGroupIndex
=
threadIdx
.
x
/
TILE_SIZE
;
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
unsigned
int
x
,
y
;
// First loop: process tiles that contain exclusions.
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE
+
warp
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE
+
(
warp
+
1
)
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
for
(
int
pos
=
firstExclusionTile
;
pos
<
lastExclusionTile
;
pos
++
)
{
const
ushort2
tileIndices
=
exclusionTiles
[
pos
];
const
unsigned
int
x
=
tileIndices
.
x
;
const
unsigned
int
y
=
tileIndices
.
y
;
real
value
=
0
;
real
value
=
0
;
if
(
pos
<
end
)
{
#ifdef USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles
[
pos
];
x
=
tileIndices
.
x
;
y
=
tileIndices
.
y
;
}
else
#endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}
}
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real4
posq1
=
posq
[
atom1
];
real4
posq1
=
posq
[
atom1
];
LOAD_ATOM1_PARAMETERS
LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS
#ifdef USE_EXCLUSIONS
if
(
tgx
<
2
)
unsigned
int
excl
=
exclusions
[
pos
*
TILE_SIZE
+
tgx
];
exclusionRange
[
2
*
localGroupIndex
+
tgx
]
=
exclusionRowIndices
[
x
+
tgx
];
if
(
tgx
==
0
)
exclusionIndex
[
localGroupIndex
]
=
-
1
;
for
(
unsigned
int
i
=
exclusionRange
[
2
*
localGroupIndex
]
+
tgx
;
i
<
exclusionRange
[
2
*
localGroupIndex
+
1
];
i
+=
TILE_SIZE
)
if
(
exclusionIndices
[
i
]
==
y
)
exclusionIndex
[
localGroupIndex
]
=
i
*
TILE_SIZE
;
bool
hasExclusions
=
(
exclusionIndex
[
localGroupIndex
]
>
-
1
);
#else
bool
hasExclusions
=
false
;
#endif
#endif
if
(
pos
>=
end
)
if
(
x
==
y
)
{
;
// This warp is done.
else
if
(
x
==
y
)
{
// This tile is on the diagonal.
// This tile is on the diagonal.
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
localData
[
localAtomIndex
].
posq
=
posq1
;
localData
[
localAtomIndex
].
posq
=
posq1
;
LOAD_LOCAL_PARAMETERS_FROM_1
LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS
unsigned
int
excl
=
exclusions
[
exclusionIndex
[
localGroupIndex
]
+
tgx
];
#endif
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
#ifdef USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
);
#endif
int
atom2
=
tbx
+
j
;
int
atom2
=
tbx
+
j
;
real4
posq2
=
localData
[
atom2
].
posq
;
real4
posq2
=
localData
[
atom2
].
posq
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
...
@@ -112,7 +65,8 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
...
@@ -112,7 +65,8 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
real
tempValue1
=
0
;
real
tempValue1
=
0
;
real
tempValue2
=
0
;
real
tempValue2
=
0
;
#ifdef USE_EXCLUSIONS
#ifdef USE_EXCLUSIONS
if
(
!
isExcluded
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
atom1
!=
atom2
)
{
bool
isExcluded
=
(
atom1
>=
NUM_ATOMS
||
atom2
>=
NUM_ATOMS
||
!
(
excl
&
0x1
));
if
(
!
isExcluded
&&
atom1
!=
atom2
)
{
#else
#else
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
atom1
!=
atom2
)
{
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
atom1
!=
atom2
)
{
#endif
#endif
...
@@ -130,25 +84,17 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
...
@@ -130,25 +84,17 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
else
{
else
{
// This is an off-diagonal tile.
// This is an off-diagonal tile.
if
(
lasty
!=
y
)
{
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
localData
[
threadIdx
.
x
].
posq
=
posq
[
j
];
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
localData
[
localAtomIndex
].
posq
=
posq
[
j
];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
localData
[
localAtomIndex
].
value
=
0
;
localData
[
threadIdx
.
x
].
value
=
0
;
#ifdef USE_EXCLUSIONS
#ifdef USE_CUTOFF
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
TILE_SIZE
-
tgx
));
unsigned
int
flags
=
(
numTiles
<=
maxTiles
?
interactionFlags
[
pos
]
:
0xFFFFFFFF
);
#endif
if
(
!
hasExclusions
&&
flags
!=
0xFFFFFFFF
)
{
unsigned
int
tj
=
tgx
;
if
(
flags
==
0
)
{
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
// No interactions in this tile.
int
atom2
=
tbx
+
tj
;
}
else
{
// Compute only a subset of the interactions in this tile.
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
if
((
flags
&
(
1
<<
j
))
!=
0
)
{
int
atom2
=
tbx
+
j
;
real4
posq2
=
localData
[
atom2
].
posq
;
real4
posq2
=
localData
[
atom2
].
posq
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
...
@@ -157,44 +103,162 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
...
@@ -157,44 +103,162 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
real
tempValue1
=
0
;
#ifdef USE_CUTOFF
real
tempValue2
=
0
;
if
(
r2
<
CUTOFF_SQUARED
)
{
if
(
r2
<
CUTOFF_SQUARED
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
real
r
=
RECIP
(
invR
);
LOAD_ATOM2_PARAMETERS
LOAD_ATOM2_PARAMETERS
atom2
=
y
*
TILE_SIZE
+
j
;
atom2
=
y
*
TILE_SIZE
+
tj
;
real
tempValue1
=
0
;
real
tempValue2
=
0
;
#ifdef USE_EXCLUSIONS
bool
isExcluded
=
(
atom1
>=
NUM_ATOMS
||
atom2
>=
NUM_ATOMS
||
!
(
excl
&
0x1
));
if
(
!
isExcluded
)
{
#else
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
#endif
COMPUTE_VALUE
COMPUTE_VALUE
}
}
value
+=
tempValue1
;
value
+=
tempValue1
;
localData
[
tbx
+
tj
].
value
+=
tempValue2
;
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl
>>=
1
;
#endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
}
localData
[
threadIdx
.
x
].
temp
=
tempValue2
;
// Sum the forces on atom2
.
// Write results
.
if
(
tgx
%
4
==
0
)
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
localData
[
threadIdx
.
x
].
temp
+=
lo
c
al
Data
[
threadIdx
.
x
+
1
].
temp
+
localData
[
threadIdx
.
x
+
2
].
temp
+
localData
[
threadIdx
.
x
+
3
].
temp
;
atomicAdd
(
&
g
lo
b
al
_value
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
value
*
0x100000000
)))
;
if
(
tg
x
=
=
0
)
if
(
x
!
=
y
)
{
localData
[
tbx
+
j
].
value
+=
localData
[
threadIdx
.
x
].
temp
+
localData
[
threadIdx
.
x
+
4
].
temp
+
localData
[
threadIdx
.
x
+
8
].
temp
+
localData
[
threadIdx
.
x
+
12
].
temp
+
localData
[
threadIdx
.
x
+
16
].
temp
+
localData
[
threadIdx
.
x
+
20
].
temp
+
localData
[
threadIdx
.
x
+
24
].
temp
+
localData
[
threadIdx
.
x
+
28
].
temp
;
offset
=
y
*
TILE_SIZE
+
tgx
;
}
atomicAdd
(
&
global_value
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
value
*
0x100000000
)));
}
}
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
unsigned
int
numTiles
=
interactionCount
[
0
];
int
pos
=
warp
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
int
end
=
(
warp
+
1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
#else
int
pos
=
warp
*
numTiles
/
totalWarps
;
int
end
=
(
warp
+
1
)
*
numTiles
/
totalWarps
;
#endif
int
skipBase
=
0
;
int
currentSkipIndex
=
tbx
;
__shared__
int
atomIndices
[
THREAD_BLOCK_SIZE
];
__shared__
int
skipTiles
[
THREAD_BLOCK_SIZE
];
skipTiles
[
threadIdx
.
x
]
=
-
1
;
while
(
pos
<
end
)
{
real
value
=
0
;
bool
includeTile
=
true
;
// Extract the coordinates of this tile.
unsigned
int
x
,
y
;
bool
singlePeriodicCopy
=
false
;
#ifdef USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles
[
pos
];
x
=
tileIndices
.
x
;
singlePeriodicCopy
=
tileIndices
.
y
;
}
}
else
else
#endif
#endif
{
{
// Compute the full set of interactions in this tile.
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}
#ifdef USE_EXCLUSIONS
// Skip over tiles that have exclusions, since they were already processed.
unsigned
int
excl
=
(
hasExclusions
?
exclusions
[
exclusionIndex
[
localGroupIndex
]
+
tgx
]
:
0xFFFFFFFF
);
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
TILE_SIZE
-
tgx
));
while
(
skipTiles
[
tbx
+
TILE_SIZE
-
1
]
<
pos
)
{
if
(
skipBase
+
tgx
<
NUM_TILES_WITH_EXCLUSIONS
)
{
ushort2
tile
=
exclusionTiles
[
skipBase
+
tgx
];
skipTiles
[
threadIdx
.
x
]
=
tile
.
x
+
tile
.
y
*
NUM_BLOCKS
-
tile
.
y
*
(
tile
.
y
+
1
)
/
2
;
}
else
skipTiles
[
threadIdx
.
x
]
=
end
;
skipBase
+=
TILE_SIZE
;
currentSkipIndex
=
tbx
;
}
while
(
skipTiles
[
currentSkipIndex
]
<
pos
)
currentSkipIndex
++
;
includeTile
=
(
skipTiles
[
currentSkipIndex
]
!=
pos
);
}
if
(
includeTile
)
{
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
// Load atom data for this tile.
real4
posq1
=
posq
[
atom1
];
LOAD_ATOM1_PARAMETERS
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
#ifdef USE_CUTOFF
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
]
:
y
*
TILE_SIZE
+
tgx
);
#else
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
#endif
atomIndices
[
threadIdx
.
x
]
=
j
;
if
(
j
<
PADDED_NUM_ATOMS
)
{
localData
[
localAtomIndex
].
posq
=
posq
[
j
];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData
[
localAtomIndex
].
value
=
0
;
}
#ifdef USE_PERIODIC
if
(
singlePeriodicCopy
)
{
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
real4
blockCenterX
=
blockCenter
[
x
];
posq1
.
x
-=
floor
((
posq1
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
posq1
.
y
-=
floor
((
posq1
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
posq1
.
z
-=
floor
((
posq1
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
localData
[
threadIdx
.
x
].
posq
.
x
-=
floor
((
localData
[
threadIdx
.
x
].
posq
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
localData
[
threadIdx
.
x
].
posq
.
y
-=
floor
((
localData
[
threadIdx
.
x
].
posq
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
localData
[
threadIdx
.
x
].
posq
.
z
-=
floor
((
localData
[
threadIdx
.
x
].
posq
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
unsigned
int
tj
=
tgx
;
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
#ifdef USE_EXCLUSIONS
int
atom2
=
tbx
+
tj
;
bool
isExcluded
=
!
(
excl
&
0x1
);
real4
posq2
=
localData
[
atom2
].
posq
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
if
(
r2
<
CUTOFF_SQUARED
)
{
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
LOAD_ATOM2_PARAMETERS
atom2
=
atomIndices
[
tbx
+
tj
];
real
tempValue1
=
0
;
real
tempValue2
=
0
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
COMPUTE_VALUE
}
value
+=
tempValue1
;
localData
[
tbx
+
tj
].
value
+=
tempValue2
;
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
else
#endif
#endif
{
// We need to apply periodic boundary conditions separately for each interaction.
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
int
atom2
=
tbx
+
tj
;
real4
posq2
=
localData
[
atom2
].
posq
;
real4
posq2
=
localData
[
atom2
].
posq
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
...
@@ -210,41 +274,32 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
...
@@ -210,41 +274,32 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
real
invR
=
RSQRT
(
r2
);
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
real
r
=
RECIP
(
invR
);
LOAD_ATOM2_PARAMETERS
LOAD_ATOM2_PARAMETERS
atom2
=
y
*
TILE_SIZE
+
tj
;
atom2
=
atomIndices
[
tbx
+
tj
]
;
real
tempValue1
=
0
;
real
tempValue1
=
0
;
real
tempValue2
=
0
;
real
tempValue2
=
0
;
#ifdef USE_EXCLUSIONS
if
(
!
isExcluded
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
#else
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
#endif
COMPUTE_VALUE
COMPUTE_VALUE
}
}
value
+=
tempValue1
;
value
+=
tempValue1
;
localData
[
tbx
+
tj
].
value
+=
tempValue2
;
localData
[
tbx
+
tj
].
value
+=
tempValue2
;
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
}
}
#endif
#ifdef USE_EXCLUSIONS
excl
>>=
1
;
#endif
#endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
}
}
}
}
// Write results.
// Write results.
if
(
pos
<
end
)
{
atomicAdd
(
&
global_value
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
value
*
0x100000000
)));
const
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
#ifdef USE_CUTOFF
atomicAdd
(
&
global_value
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
value
*
0x100000000
)));
unsigned
int
atom2
=
atomIndices
[
threadIdx
.
x
];
}
#else
if
(
pos
<
end
&&
x
!=
y
)
{
unsigned
int
atom2
=
y
*
TILE_SIZE
+
tgx
;
const
unsigned
int
offset
=
y
*
TILE_SIZE
+
tgx
;
#endif
atomicAdd
(
&
global_value
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
value
*
0x100000000
)));
if
(
atom2
<
PADDED_NUM_ATOMS
)
atomicAdd
(
&
global_value
[
atom2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
value
*
0x100000000
)));
}
}
lasty
=
y
;
pos
++
;
pos
++
;
}
while
(
pos
<
end
);
}
}
}
platforms/cuda/src/kernels/customHbondForce.cu
View file @
93c467b2
...
@@ -48,12 +48,12 @@ inline __device__ real computeAngle(real4 vec1, real4 vec2) {
...
@@ -48,12 +48,12 @@ inline __device__ real computeAngle(real4 vec1, real4 vec2) {
real3
crossProduct
=
cross
(
vec1
,
vec2
);
real3
crossProduct
=
cross
(
vec1
,
vec2
);
real
scale
=
vec1
.
w
*
vec2
.
w
;
real
scale
=
vec1
.
w
*
vec2
.
w
;
angle
=
asin
(
SQRT
(
dot
(
crossProduct
,
crossProduct
)
/
scale
));
angle
=
ASIN
(
SQRT
(
dot
(
crossProduct
,
crossProduct
)
/
scale
));
if
(
cosine
<
0.0
f
)
if
(
cosine
<
0.0
f
)
angle
=
M_PI
-
angle
;
angle
=
M_PI
-
angle
;
}
}
else
else
angle
=
acos
(
cosine
);
angle
=
ACOS
(
cosine
);
return
angle
;
return
angle
;
}
}
...
...
platforms/cuda/src/kernels/ewald.cu
View file @
93c467b2
...
@@ -35,11 +35,11 @@ extern "C" __global__ void calculateEwaldCosSinSums(real* __restrict__ energyBuf
...
@@ -35,11 +35,11 @@ extern "C" __global__ void calculateEwaldCosSinSums(real* __restrict__ energyBuf
for
(
int
atom
=
0
;
atom
<
NUM_ATOMS
;
atom
++
)
{
for
(
int
atom
=
0
;
atom
<
NUM_ATOMS
;
atom
++
)
{
real4
apos
=
posq
[
atom
];
real4
apos
=
posq
[
atom
];
real
phase
=
apos
.
x
*
kx
;
real
phase
=
apos
.
x
*
kx
;
real2
structureFactor
=
make_real2
(
cos
(
phase
),
sin
(
phase
));
real2
structureFactor
=
make_real2
(
COS
(
phase
),
SIN
(
phase
));
phase
=
apos
.
y
*
ky
;
phase
=
apos
.
y
*
ky
;
structureFactor
=
multofReal2
(
structureFactor
,
make_real2
(
cos
(
phase
),
sin
(
phase
)));
structureFactor
=
multofReal2
(
structureFactor
,
make_real2
(
COS
(
phase
),
SIN
(
phase
)));
phase
=
apos
.
z
*
kz
;
phase
=
apos
.
z
*
kz
;
structureFactor
=
multofReal2
(
structureFactor
,
make_real2
(
cos
(
phase
),
sin
(
phase
)));
structureFactor
=
multofReal2
(
structureFactor
,
make_real2
(
COS
(
phase
),
SIN
(
phase
)));
sum
+=
apos
.
w
*
structureFactor
;
sum
+=
apos
.
w
*
structureFactor
;
}
}
cosSinSum
[
index
]
=
sum
;
cosSinSum
[
index
]
=
sum
;
...
@@ -76,9 +76,9 @@ extern "C" __global__ void calculateEwaldForces(unsigned long long* __restrict__
...
@@ -76,9 +76,9 @@ extern "C" __global__ void calculateEwaldForces(unsigned long long* __restrict__
for
(
int
ry
=
lowry
;
ry
<
KMAX_Y
;
ry
++
)
{
for
(
int
ry
=
lowry
;
ry
<
KMAX_Y
;
ry
++
)
{
real
ky
=
ry
*
reciprocalBoxSize
.
y
;
real
ky
=
ry
*
reciprocalBoxSize
.
y
;
real
phase
=
apos
.
x
*
kx
;
real
phase
=
apos
.
x
*
kx
;
real2
tab_xy
=
make_real2
(
cos
(
phase
),
sin
(
phase
));
real2
tab_xy
=
make_real2
(
COS
(
phase
),
SIN
(
phase
));
phase
=
apos
.
y
*
ky
;
phase
=
apos
.
y
*
ky
;
tab_xy
=
multofReal2
(
tab_xy
,
make_real2
(
cos
(
phase
),
sin
(
phase
)));
tab_xy
=
multofReal2
(
tab_xy
,
make_real2
(
COS
(
phase
),
SIN
(
phase
)));
for
(
int
rz
=
lowrz
;
rz
<
KMAX_Z
;
rz
++
)
{
for
(
int
rz
=
lowrz
;
rz
<
KMAX_Z
;
rz
++
)
{
real
kz
=
rz
*
reciprocalBoxSize
.
z
;
real
kz
=
rz
*
reciprocalBoxSize
.
z
;
...
@@ -88,7 +88,7 @@ extern "C" __global__ void calculateEwaldForces(unsigned long long* __restrict__
...
@@ -88,7 +88,7 @@ extern "C" __global__ void calculateEwaldForces(unsigned long long* __restrict__
real
k2
=
kx
*
kx
+
ky
*
ky
+
kz
*
kz
;
real
k2
=
kx
*
kx
+
ky
*
ky
+
kz
*
kz
;
real
ak
=
EXP
(
k2
*
EXP_COEFFICIENT
)
/
k2
;
real
ak
=
EXP
(
k2
*
EXP_COEFFICIENT
)
/
k2
;
phase
=
apos
.
z
*
kz
;
phase
=
apos
.
z
*
kz
;
real2
structureFactor
=
multofReal2
(
tab_xy
,
make_real2
(
cos
(
phase
),
sin
(
phase
)));
real2
structureFactor
=
multofReal2
(
tab_xy
,
make_real2
(
COS
(
phase
),
SIN
(
phase
)));
real2
sum
=
cosSinSum
[
index
];
real2
sum
=
cosSinSum
[
index
];
real
dEdR
=
2
*
reciprocalCoefficient
*
ak
*
apos
.
w
*
(
sum
.
x
*
structureFactor
.
y
-
sum
.
y
*
structureFactor
.
x
);
real
dEdR
=
2
*
reciprocalCoefficient
*
ak
*
apos
.
w
*
(
sum
.
x
*
structureFactor
.
y
-
sum
.
y
*
structureFactor
.
x
);
force
.
x
+=
dEdR
*
kx
;
force
.
x
+=
dEdR
*
kx
;
...
...
platforms/cuda/src/kernels/findInteractingBlocks.cu
View file @
93c467b2
#define TILE_SIZE 32
#define GROUP_SIZE 256
#define GROUP_SIZE 64
#define BUFFER_GROUPS 2
#define BUFFER_GROUPS 4
#define BUFFER_SIZE BUFFER_GROUPS*GROUP_SIZE
#define BUFFER_SIZE BUFFER_GROUPS*GROUP_SIZE
#define WARP_SIZE 32
#define INVALID 0xFFFF
/**
/**
* Find a bounding box for the atoms in each block.
* Find a bounding box for the atoms in each block.
*/
*/
extern
"C"
__global__
void
findBlockBounds
(
int
numAtoms
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
const
real4
*
__restrict__
posq
,
real4
*
__restrict__
blockCenter
,
real4
*
__restrict__
blockBoundingBox
,
unsigned
int
*
__restrict__
interactionCount
)
{
extern
"C"
__global__
void
findBlockBounds
(
int
numAtoms
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
const
real4
*
__restrict__
posq
,
real4
*
__restrict__
blockCenter
,
real4
*
__restrict__
blockBoundingBox
,
int
*
__restrict__
rebuildNeighborList
,
real2
*
__restrict__
sortedBlocks
)
{
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
base
=
index
*
TILE_SIZE
;
int
base
=
index
*
TILE_SIZE
;
while
(
base
<
numAtoms
)
{
while
(
base
<
numAtoms
)
{
...
@@ -30,68 +32,231 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize,
...
@@ -30,68 +32,231 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize,
minPos
=
make_real4
(
min
(
minPos
.
x
,
pos
.
x
),
min
(
minPos
.
y
,
pos
.
y
),
min
(
minPos
.
z
,
pos
.
z
),
0
);
minPos
=
make_real4
(
min
(
minPos
.
x
,
pos
.
x
),
min
(
minPos
.
y
,
pos
.
y
),
min
(
minPos
.
z
,
pos
.
z
),
0
);
maxPos
=
make_real4
(
max
(
maxPos
.
x
,
pos
.
x
),
max
(
maxPos
.
y
,
pos
.
y
),
max
(
maxPos
.
z
,
pos
.
z
),
0
);
maxPos
=
make_real4
(
max
(
maxPos
.
x
,
pos
.
x
),
max
(
maxPos
.
y
,
pos
.
y
),
max
(
maxPos
.
z
,
pos
.
z
),
0
);
}
}
blockBoundingBox
[
index
]
=
0.5
f
*
(
maxPos
-
minPos
);
real4
blockSize
=
0.5
f
*
(
maxPos
-
minPos
);
blockBoundingBox
[
index
]
=
blockSize
;
blockCenter
[
index
]
=
0.5
f
*
(
maxPos
+
minPos
);
blockCenter
[
index
]
=
0.5
f
*
(
maxPos
+
minPos
);
sortedBlocks
[
index
]
=
make_real2
(
blockSize
.
x
+
blockSize
.
y
+
blockSize
.
z
,
index
);
index
+=
blockDim
.
x
*
gridDim
.
x
;
index
+=
blockDim
.
x
*
gridDim
.
x
;
base
=
index
*
TILE_SIZE
;
base
=
index
*
TILE_SIZE
;
}
}
if
(
blockIdx
.
x
==
0
&&
threadIdx
.
x
==
0
)
if
(
blockIdx
.
x
==
0
&&
threadIdx
.
x
==
0
)
interactionCoun
t
[
0
]
=
0
;
rebuildNeighborLis
t
[
0
]
=
0
;
}
}
/**
/**
* This is called by findBlocksWithInteractions(). It compacts the list of blocks and writes them
* Sort the data about bounding boxes so it can be accessed more efficiently in the next kernel.
* to global memory.
*/
*/
__device__
void
storeInteractionData
(
ushort2
*
buffer
,
int
*
valid
,
short
*
sum
,
ushort2
*
temp
,
int
*
baseIndex
,
extern
"C"
__global__
void
sortBoxData
(
const
real2
*
__restrict__
sortedBlock
,
const
real4
*
__restrict__
blockCenter
,
unsigned
int
*
interactionCount
,
ushort2
*
interactingTiles
,
real4
periodicBoxSize
,
const
real4
*
__restrict__
blockBoundingBox
,
real4
*
__restrict__
sortedBlockCenter
,
real4
invPeriodicBoxSize
,
const
real4
*
posq
,
const
real4
*
blockCenter
,
const
real4
*
blockBoundingBox
,
unsigned
int
maxTiles
)
{
real4
*
__restrict__
sortedBlockBoundingBox
,
const
real4
*
__restrict__
posq
,
const
real4
*
__restrict__
oldPositions
,
// The buffer is full, so we need to compact it and write out results. Start by doing a parallel prefix sum.
unsigned
int
*
__restrict__
interactionCount
,
int
*
__restrict__
rebuildNeighborList
)
{
for
(
int
i
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
i
<
NUM_BLOCKS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
index
=
(
int
)
sortedBlock
[
i
].
y
;
sortedBlockCenter
[
i
]
=
blockCenter
[
index
];
sortedBlockBoundingBox
[
i
]
=
blockBoundingBox
[
index
];
}
// Also check whether any atom has moved enough so that we really need to rebuild the neighbor list.
for
(
int
i
=
threadIdx
.
x
;
i
<
BUFFER_SIZE
;
i
+=
GROUP_SIZE
)
bool
rebuild
=
false
;
temp
[
i
].
x
=
(
valid
[
i
]
?
1
:
0
);
for
(
int
i
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
real4
delta
=
oldPositions
[
i
]
-
posq
[
i
];
if
(
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
>
0.25
f
*
PADDING
*
PADDING
)
rebuild
=
true
;
}
if
(
rebuild
)
{
rebuildNeighborList
[
0
]
=
1
;
interactionCount
[
0
]
=
0
;
}
}
/**
* Perform a parallel prefix sum over an array. The input values are all assumed to be 0 or 1.
*/
__device__
void
prefixSum
(
short
*
sum
,
ushort2
*
temp
)
{
#if __CUDA_ARCH__ >= 300
const
int
indexInWarp
=
threadIdx
.
x
%
WARP_SIZE
;
const
int
warpMask
=
(
2
<<
indexInWarp
)
-
1
;
for
(
int
base
=
0
;
base
<
BUFFER_SIZE
;
base
+=
blockDim
.
x
)
temp
[
base
+
threadIdx
.
x
].
x
=
__popc
(
__ballot
(
sum
[
base
+
threadIdx
.
x
])
&
warpMask
);
__syncthreads
();
if
(
threadIdx
.
x
<
BUFFER_SIZE
/
WARP_SIZE
)
{
int
multiWarpSum
=
temp
[(
threadIdx
.
x
+
1
)
*
WARP_SIZE
-
1
].
x
;
for
(
int
offset
=
1
;
offset
<
BUFFER_SIZE
/
WARP_SIZE
;
offset
*=
2
)
{
short
n
=
__shfl_up
(
multiWarpSum
,
offset
,
WARP_SIZE
);
if
(
indexInWarp
>=
offset
)
multiWarpSum
+=
n
;
}
temp
[
threadIdx
.
x
].
y
=
multiWarpSum
;
}
__syncthreads
();
for
(
int
i
=
threadIdx
.
x
;
i
<
BUFFER_SIZE
;
i
+=
blockDim
.
x
)
sum
[
i
]
=
temp
[
i
].
x
+
(
i
<
WARP_SIZE
?
0
:
temp
[
i
/
WARP_SIZE
-
1
].
y
);
__syncthreads
();
#else
for
(
int
i
=
threadIdx
.
x
;
i
<
BUFFER_SIZE
;
i
+=
blockDim
.
x
)
temp
[
i
].
x
=
sum
[
i
];
__syncthreads
();
__syncthreads
();
int
whichBuffer
=
0
;
int
whichBuffer
=
0
;
for
(
int
offset
=
1
;
offset
<
BUFFER_SIZE
;
offset
*=
2
)
{
for
(
int
offset
=
1
;
offset
<
BUFFER_SIZE
;
offset
*=
2
)
{
if
(
whichBuffer
==
0
)
if
(
whichBuffer
==
0
)
for
(
int
i
=
threadIdx
.
x
;
i
<
BUFFER_SIZE
;
i
+=
GROUP_SIZE
)
for
(
int
i
=
threadIdx
.
x
;
i
<
BUFFER_SIZE
;
i
+=
blockDim
.
x
)
temp
[
i
].
y
=
(
i
<
offset
?
temp
[
i
].
x
:
temp
[
i
].
x
+
temp
[
i
-
offset
].
x
);
temp
[
i
].
y
=
(
i
<
offset
?
temp
[
i
].
x
:
temp
[
i
].
x
+
temp
[
i
-
offset
].
x
);
else
else
for
(
int
i
=
threadIdx
.
x
;
i
<
BUFFER_SIZE
;
i
+=
GROUP_SIZE
)
for
(
int
i
=
threadIdx
.
x
;
i
<
BUFFER_SIZE
;
i
+=
blockDim
.
x
)
temp
[
i
].
x
=
(
i
<
offset
?
temp
[
i
].
y
:
temp
[
i
].
y
+
temp
[
i
-
offset
].
y
);
temp
[
i
].
x
=
(
i
<
offset
?
temp
[
i
].
y
:
temp
[
i
].
y
+
temp
[
i
-
offset
].
y
);
whichBuffer
=
1
-
whichBuffer
;
whichBuffer
=
1
-
whichBuffer
;
__syncthreads
();
__syncthreads
();
}
}
if
(
whichBuffer
==
0
)
if
(
whichBuffer
==
0
)
for
(
int
i
=
threadIdx
.
x
;
i
<
BUFFER_SIZE
;
i
+=
GROUP_SIZE
)
for
(
int
i
=
threadIdx
.
x
;
i
<
BUFFER_SIZE
;
i
+=
blockDim
.
x
)
sum
[
i
]
=
temp
[
i
].
x
;
sum
[
i
]
=
temp
[
i
].
x
;
else
else
for
(
int
i
=
threadIdx
.
x
;
i
<
BUFFER_SIZE
;
i
+=
GROUP_SIZE
)
for
(
int
i
=
threadIdx
.
x
;
i
<
BUFFER_SIZE
;
i
+=
blockDim
.
x
)
sum
[
i
]
=
temp
[
i
].
y
;
sum
[
i
]
=
temp
[
i
].
y
;
__syncthreads
();
__syncthreads
();
int
numValid
=
sum
[
BUFFER_SIZE
-
1
];
#endif
}
/**
* This is called by findBlocksWithInteractions(). It compacts the list of blocks, identifies interactions
* in them, and writes the result to global memory.
*/
__device__
void
storeInteractionData
(
unsigned
short
x
,
unsigned
short
*
buffer
,
short
*
sum
,
ushort2
*
temp
,
int
*
atoms
,
int
&
numAtoms
,
int
&
baseIndex
,
unsigned
int
*
interactionCount
,
ushort2
*
interactingTiles
,
unsigned
int
*
interactingAtoms
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
const
real4
*
posq
,
real3
*
posBuffer
,
real4
blockCenterX
,
real4
blockSizeX
,
unsigned
int
maxTiles
,
bool
finish
)
{
const
bool
singlePeriodicCopy
=
(
0.5
f
*
periodicBoxSize
.
x
-
blockSizeX
.
x
>=
PADDED_CUTOFF
&&
0.5
f
*
periodicBoxSize
.
y
-
blockSizeX
.
y
>=
PADDED_CUTOFF
&&
0.5
f
*
periodicBoxSize
.
z
-
blockSizeX
.
z
>=
PADDED_CUTOFF
);
if
(
threadIdx
.
x
<
TILE_SIZE
)
{
real3
pos
=
trimTo3
(
posq
[
x
*
TILE_SIZE
+
threadIdx
.
x
]);
posBuffer
[
threadIdx
.
x
]
=
pos
;
#ifdef USE_PERIODIC
if
(
singlePeriodicCopy
)
{
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
pos
.
x
-=
floor
((
pos
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
pos
.
y
-=
floor
((
pos
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
pos
.
z
-=
floor
((
pos
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
posBuffer
[
threadIdx
.
x
]
=
pos
;
}
#endif
}
// The buffer is full, so we need to compact it and write out results. Start by doing a parallel prefix sum.
for
(
int
i
=
threadIdx
.
x
;
i
<
BUFFER_SIZE
;
i
+=
blockDim
.
x
)
sum
[
i
]
=
(
buffer
[
i
]
==
INVALID
?
0
:
1
);
__syncthreads
();
__syncthreads
();
prefixSum
(
sum
,
temp
);
int
numValid
=
sum
[
BUFFER_SIZE
-
1
];
// Compact the buffer.
// Compact the buffer.
for
(
int
i
=
threadIdx
.
x
;
i
<
BUFFER_SIZE
;
i
+=
GROUP_SIZE
)
for
(
int
i
=
threadIdx
.
x
;
i
<
BUFFER_SIZE
;
i
+=
blockDim
.
x
)
if
(
valid
[
i
])
{
if
(
buffer
[
i
]
!=
INVALID
)
temp
[
sum
[
i
]
-
1
]
=
buffer
[
i
];
temp
[
sum
[
i
]
-
1
].
x
=
buffer
[
i
];
sum
[
i
]
=
valid
[
i
];
__syncthreads
();
valid
[
i
]
=
false
;
for
(
int
i
=
threadIdx
.
x
;
i
<
BUFFER_SIZE
;
i
+=
blockDim
.
x
)
buffer
[
i
]
=
make_ushort2
(
1
,
1
);
buffer
[
i
]
=
temp
[
i
].
x
;
__syncthreads
();
// Loop over the tiles and find specific interactions in them.
const
int
indexInWarp
=
threadIdx
.
x
%
WARP_SIZE
;
for
(
int
base
=
0
;
base
<
numValid
;
base
+=
BUFFER_SIZE
/
WARP_SIZE
)
{
for
(
int
i
=
threadIdx
.
x
/
WARP_SIZE
;
i
<
BUFFER_SIZE
/
WARP_SIZE
&&
base
+
i
<
numValid
;
i
+=
GROUP_SIZE
/
WARP_SIZE
)
{
// Check each atom in block Y for interactions.
real3
pos
=
trimTo3
(
posq
[
buffer
[
base
+
i
]
*
TILE_SIZE
+
indexInWarp
]);
#ifdef USE_PERIODIC
if
(
singlePeriodicCopy
)
{
pos
.
x
-=
floor
((
pos
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
pos
.
y
-=
floor
((
pos
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
pos
.
z
-=
floor
((
pos
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
}
#endif
bool
interacts
=
false
;
#ifdef USE_PERIODIC
if
(
!
singlePeriodicCopy
)
{
for
(
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
delta
=
pos
-
posBuffer
[
j
];
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
interacts
|=
(
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
<
PADDED_CUTOFF_SQUARED
);
}
}
else
{
#endif
for
(
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
delta
=
pos
-
posBuffer
[
j
];
interacts
|=
(
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
<
PADDED_CUTOFF_SQUARED
);
}
#ifdef USE_PERIODIC
}
}
#endif
sum
[
i
*
WARP_SIZE
+
indexInWarp
]
=
(
interacts
?
1
:
0
);
}
for
(
int
i
=
numValid
-
base
+
threadIdx
.
x
/
WARP_SIZE
;
i
<
BUFFER_SIZE
/
WARP_SIZE
;
i
+=
GROUP_SIZE
/
WARP_SIZE
)
sum
[
i
*
WARP_SIZE
+
indexInWarp
]
=
0
;
// Compact the list of atoms.
__syncthreads
();
__syncthreads
();
prefixSum
(
sum
,
temp
);
for
(
int
i
=
threadIdx
.
x
;
i
<
BUFFER_SIZE
;
i
+=
blockDim
.
x
)
if
(
sum
[
i
]
!=
(
i
==
0
?
0
:
sum
[
i
-
1
]))
atoms
[
numAtoms
+
sum
[
i
]
-
1
]
=
buffer
[
base
+
i
/
WARP_SIZE
]
*
TILE_SIZE
+
indexInWarp
;
// Store
i
t to global memory.
// Store t
hem
to global memory.
int
atomsToStore
=
numAtoms
+
sum
[
BUFFER_SIZE
-
1
];
bool
storePartialTile
=
(
finish
&&
base
>=
numValid
-
BUFFER_SIZE
/
WARP_SIZE
);
int
tilesToStore
=
(
storePartialTile
?
(
atomsToStore
+
TILE_SIZE
-
1
)
/
TILE_SIZE
:
atomsToStore
/
TILE_SIZE
);
if
(
tilesToStore
>
0
)
{
if
(
threadIdx
.
x
==
0
)
if
(
threadIdx
.
x
==
0
)
*
baseIndex
=
atomicAdd
(
interactionCount
,
numValid
);
baseIndex
=
atomicAdd
(
interactionCount
,
tilesToStore
);
__syncthreads
();
__syncthreads
();
if
(
*
baseIndex
+
numValid
<=
maxTiles
)
if
(
threadIdx
.
x
==
0
)
for
(
int
i
=
threadIdx
.
x
;
i
<
numValid
;
i
+=
GROUP_SIZE
)
numAtoms
=
atomsToStore
-
tilesToStore
*
TILE_SIZE
;
interactingTiles
[
*
baseIndex
+
i
]
=
temp
[
i
];
if
(
baseIndex
+
tilesToStore
<=
maxTiles
)
{
if
(
threadIdx
.
x
<
tilesToStore
)
interactingTiles
[
baseIndex
+
threadIdx
.
x
]
=
make_ushort2
(
x
,
singlePeriodicCopy
);
for
(
int
i
=
threadIdx
.
x
;
i
<
tilesToStore
*
TILE_SIZE
;
i
+=
blockDim
.
x
)
interactingAtoms
[
baseIndex
*
TILE_SIZE
+
i
]
=
(
i
<
atomsToStore
?
atoms
[
i
]
:
NUM_ATOMS
);
}
}
else
{
__syncthreads
();
if
(
threadIdx
.
x
==
0
)
numAtoms
+=
sum
[
BUFFER_SIZE
-
1
];
}
__syncthreads
();
__syncthreads
();
if
(
threadIdx
.
x
<
numAtoms
&&
!
storePartialTile
)
atoms
[
threadIdx
.
x
]
=
atoms
[
tilesToStore
*
TILE_SIZE
+
threadIdx
.
x
];
}
if
(
numValid
==
0
&&
numAtoms
>
0
&&
finish
)
{
// We didn't have any more tiles to process, but there were some atoms left over from a
// previous call to this function. Save them now.
if
(
threadIdx
.
x
==
0
)
baseIndex
=
atomicAdd
(
interactionCount
,
1
);
__syncthreads
();
if
(
baseIndex
<
maxTiles
)
{
if
(
threadIdx
.
x
==
0
)
interactingTiles
[
baseIndex
]
=
make_ushort2
(
x
,
singlePeriodicCopy
);
if
(
threadIdx
.
x
<
TILE_SIZE
)
interactingAtoms
[
baseIndex
*
TILE_SIZE
+
threadIdx
.
x
]
=
(
threadIdx
.
x
<
numAtoms
?
atoms
[
threadIdx
.
x
]
:
NUM_ATOMS
);
}
}
// Reset the buffer for processing more tiles.
for
(
int
i
=
threadIdx
.
x
;
i
<
BUFFER_SIZE
;
i
+=
blockDim
.
x
)
buffer
[
i
]
=
INVALID
;
}
}
/**
/**
...
@@ -100,139 +265,92 @@ __device__ void storeInteractionData(ushort2* buffer, int* valid, short* sum, us
...
@@ -100,139 +265,92 @@ __device__ void storeInteractionData(ushort2* buffer, int* valid, short* sum, us
*/
*/
extern
"C"
__global__
void
findBlocksWithInteractions
(
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
const
real4
*
__restrict__
blockCenter
,
extern
"C"
__global__
void
findBlocksWithInteractions
(
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
const
real4
*
__restrict__
blockCenter
,
const
real4
*
__restrict__
blockBoundingBox
,
unsigned
int
*
__restrict__
interactionCount
,
ushort2
*
__restrict__
interactingTiles
,
const
real4
*
__restrict__
blockBoundingBox
,
unsigned
int
*
__restrict__
interactionCount
,
ushort2
*
__restrict__
interactingTiles
,
unsigned
int
*
__restrict__
interactionFlags
,
const
real4
*
__restrict__
posq
,
unsigned
int
maxTiles
,
unsigned
int
startTileIndex
,
unsigned
int
*
__restrict__
interactingAtoms
,
const
real4
*
__restrict__
posq
,
unsigned
int
maxTiles
,
unsigned
int
startBlockIndex
,
unsigned
int
numTiles
)
{
unsigned
int
numBlocks
,
real2
*
__restrict__
sortedBlocks
,
const
real4
*
__restrict__
sortedBlockCenter
,
const
real4
*
__restrict__
sortedBlockBoundingBox
,
__shared__
ushort2
buffer
[
BUFFER_SIZE
];
const
unsigned
int
*
__restrict__
exclusionIndices
,
const
unsigned
int
*
__restrict__
exclusionRowIndices
,
real4
*
__restrict__
oldPositions
,
__shared__
int
valid
[
BUFFER_SIZE
];
const
int
*
__restrict__
rebuildNeighborList
)
{
__shared__
unsigned
short
buffer
[
BUFFER_SIZE
];
__shared__
short
sum
[
BUFFER_SIZE
];
__shared__
short
sum
[
BUFFER_SIZE
];
__shared__
ushort2
temp
[
BUFFER_SIZE
];
__shared__
ushort2
temp
[
BUFFER_SIZE
];
__shared__
int
atoms
[
BUFFER_SIZE
+
TILE_SIZE
];
__shared__
real3
posBuffer
[
TILE_SIZE
];
__shared__
int
exclusionsForX
[
MAX_EXCLUSIONS
];
__shared__
int
bufferFull
;
__shared__
int
bufferFull
;
__shared__
int
globalIndex
;
__shared__
int
globalIndex
;
unsigned
int
endTileIndex
=
startTileIndex
+
numTiles
;
__shared__
int
numAtoms
;
if
(
rebuildNeighborList
[
0
]
==
0
)
return
;
// The neighbor list doesn't need to be rebuilt.
int
valuesInBuffer
=
0
;
int
valuesInBuffer
=
0
;
if
(
threadIdx
.
x
==
0
)
if
(
threadIdx
.
x
==
0
)
bufferFull
=
false
;
bufferFull
=
false
;
for
(
int
i
=
0
;
i
<
BUFFER_GROUPS
;
++
i
)
for
(
int
i
=
0
;
i
<
BUFFER_GROUPS
;
++
i
)
valid
[
i
*
GROUP_SIZE
+
threadIdx
.
x
]
=
false
;
buffer
[
i
*
GROUP_SIZE
+
threadIdx
.
x
]
=
INVALID
;
__syncthreads
();
__syncthreads
();
for
(
int
baseIndex
=
startTileIndex
+
blockIdx
.
x
*
blockDim
.
x
;
baseIndex
<
endTileIndex
;
baseIndex
+=
blockDim
.
x
*
gridDim
.
x
)
{
// Identify the pair of blocks to compare.
int
index
=
baseIndex
+
threadIdx
.
x
;
// Loop over blocks sorted by size.
if
(
index
<
endTileIndex
)
{
unsigned
int
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
sqrt
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
index
));
for
(
int
i
=
startBlockIndex
+
blockIdx
.
x
;
i
<
startBlockIndex
+
numBlocks
;
i
+=
gridDim
.
x
)
{
unsigned
int
x
=
(
index
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
threadIdx
.
x
==
blockDim
.
x
-
1
)
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
numAtoms
=
0
;
y
+=
(
x
<
y
?
-
1
:
1
);
real2
sortedKey
=
sortedBlocks
[
i
];
x
=
(
index
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
unsigned
short
x
=
(
unsigned
short
)
sortedKey
.
y
;
}
real4
blockCenterX
=
blockCenter
[
x
];
real4
blockSizeX
=
blockBoundingBox
[
x
];
// Find the distance between the bounding boxes of the two cells
.
// Load exclusion data for block x
.
real4
delta
=
blockCenter
[
x
]
-
blockCenter
[
y
];
const
int
exclusionStart
=
exclusionRowIndices
[
x
];
real4
boxSizea
=
blockBoundingBox
[
x
];
const
int
exclusionEnd
=
exclusionRowIndices
[
x
+
1
];
real4
boxSizeb
=
blockBoundingBox
[
y
];
const
int
numExclusions
=
exclusionEnd
-
exclusionStart
;
for
(
int
j
=
threadIdx
.
x
;
j
<
numExclusions
;
j
+=
blockDim
.
x
)
exclusionsForX
[
j
]
=
exclusionIndices
[
exclusionStart
+
j
];
__syncthreads
();
// Compare it to other blocks after this one in sorted order.
for
(
int
base
=
i
+
1
;
base
<
NUM_BLOCKS
;
base
+=
blockDim
.
x
)
{
int
j
=
base
+
threadIdx
.
x
;
real2
sortedKey2
=
(
j
<
NUM_BLOCKS
?
sortedBlocks
[
j
]
:
make_real2
(
0
));
real4
blockCenterY
=
(
j
<
NUM_BLOCKS
?
sortedBlockCenter
[
j
]
:
make_real4
(
0
));
real4
blockSizeY
=
(
j
<
NUM_BLOCKS
?
sortedBlockBoundingBox
[
j
]
:
make_real4
(
0
));
unsigned
short
y
=
(
unsigned
short
)
sortedKey2
.
y
;
real4
delta
=
blockCenterX
-
blockCenterY
;
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
#endif
delta
.
x
=
max
(
0.0
f
,
fabs
(
delta
.
x
)
-
boxSizea
.
x
-
boxSizeb
.
x
);
delta
.
x
=
max
(
0.0
f
,
fabs
(
delta
.
x
)
-
blockSizeX
.
x
-
blockSizeY
.
x
);
delta
.
y
=
max
(
0.0
f
,
fabs
(
delta
.
y
)
-
boxSizea
.
y
-
boxSizeb
.
y
);
delta
.
y
=
max
(
0.0
f
,
fabs
(
delta
.
y
)
-
blockSizeX
.
y
-
blockSizeY
.
y
);
delta
.
z
=
max
(
0.0
f
,
fabs
(
delta
.
z
)
-
boxSizea
.
z
-
boxSizeb
.
z
);
delta
.
z
=
max
(
0.0
f
,
fabs
(
delta
.
z
)
-
blockSizeX
.
z
-
blockSizeY
.
z
);
if
(
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
<
CUTOFF_SQUARED
)
{
bool
hasExclusions
=
false
;
for
(
int
k
=
0
;
k
<
numExclusions
;
k
++
)
hasExclusions
|=
(
exclusionsForX
[
k
]
==
y
);
if
(
j
<
NUM_BLOCKS
&&
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
<
PADDED_CUTOFF_SQUARED
&&
!
hasExclusions
)
{
// Add this tile to the buffer.
// Add this tile to the buffer.
int
bufferIndex
=
valuesInBuffer
*
GROUP_SIZE
+
threadIdx
.
x
;
int
bufferIndex
=
valuesInBuffer
*
GROUP_SIZE
+
threadIdx
.
x
;
valid
[
bufferIndex
]
=
true
;
buffer
[
bufferIndex
]
=
y
;
buffer
[
bufferIndex
]
=
make_ushort2
(
x
,
y
);
valuesInBuffer
++
;
valuesInBuffer
++
;
if
(
!
bufferFull
&&
valuesInBuffer
==
BUFFER_GROUPS
)
if
(
!
bufferFull
&&
valuesInBuffer
==
BUFFER_GROUPS
)
bufferFull
=
true
;
bufferFull
=
true
;
}
}
}
__syncthreads
();
__syncthreads
();
if
(
bufferFull
)
{
if
(
bufferFull
)
{
storeInteractionData
(
buffer
,
valid
,
sum
,
temp
,
&
globalIndex
,
interactionCount
,
interactingTiles
,
periodicBoxSize
,
invPeriodicBoxSize
,
posq
,
blockCenter
,
block
BoundingBox
,
maxTiles
);
storeInteractionData
(
x
,
buffer
,
sum
,
temp
,
atoms
,
numAtoms
,
globalIndex
,
interactionCount
,
interactingTiles
,
interactingAtoms
,
periodicBoxSize
,
invPeriodicBoxSize
,
posq
,
posBuffer
,
blockCenter
X
,
block
SizeX
,
maxTiles
,
false
);
valuesInBuffer
=
0
;
valuesInBuffer
=
0
;
if
(
threadIdx
.
x
==
0
)
if
(
threadIdx
.
x
==
0
)
bufferFull
=
false
;
bufferFull
=
false
;
__syncthreads
();
__syncthreads
();
}
}
}
}
storeInteractionData
(
buffer
,
valid
,
sum
,
temp
,
&
globalIndex
,
interactionCount
,
interactingTiles
,
periodicBoxSize
,
invPeriodicBoxSize
,
posq
,
blockCenter
,
blockBoundingBox
,
maxTiles
);
storeInteractionData
(
x
,
buffer
,
sum
,
temp
,
atoms
,
numAtoms
,
globalIndex
,
interactionCount
,
interactingTiles
,
interactingAtoms
,
periodicBoxSize
,
invPeriodicBoxSize
,
posq
,
posBuffer
,
blockCenterX
,
blockSizeX
,
maxTiles
,
true
);
}
/**
* Compare each atom in one block to the bounding box of another block, and set
* flags for which ones are interacting.
*/
extern
"C"
__global__
void
findInteractionsWithinBlocks
(
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
const
real4
*
__restrict__
posq
,
const
ushort2
*
__restrict__
tiles
,
const
real4
*
__restrict__
blockCenter
,
const
real4
*
__restrict__
blockBoundingBox
,
unsigned
int
*
__restrict__
interactionFlags
,
const
unsigned
int
*
__restrict__
interactionCount
,
unsigned
int
maxTiles
)
{
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
unsigned
int
numTiles
=
interactionCount
[
0
];
unsigned
int
pos
=
warp
*
numTiles
/
totalWarps
;
unsigned
int
end
=
(
warp
+
1
)
*
numTiles
/
totalWarps
;
unsigned
int
index
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
#if (__CUDA_ARCH__ < 200)
__shared__
unsigned
int
flags
[
128
];
#endif
if
(
numTiles
>
maxTiles
)
return
;
unsigned
int
lasty
=
0xFFFFFFFF
;
real4
apos
;
while
(
pos
<
end
)
{
// Extract the coordinates of this tile
ushort2
tileIndices
=
tiles
[
pos
];
unsigned
int
x
=
tileIndices
.
x
;
unsigned
int
y
=
tileIndices
.
y
;
if
(
x
==
y
)
{
if
(
index
==
0
)
interactionFlags
[
pos
]
=
0xFFFFFFFF
;
}
}
else
{
// Load the bounding box for x and the atom positions for y.
real4
center
=
blockCenter
[
x
];
real4
boxSize
=
blockBoundingBox
[
x
];
if
(
y
!=
lasty
)
apos
=
posq
[
y
*
TILE_SIZE
+
index
];
//
Fin
d the
distance of the atom from the bounding box
.
//
Recor
d the
positions the neighbor list is based on
.
real4
delta
=
apos
-
center
;
for
(
int
i
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
#ifdef USE_PERIODIC
oldPositions
[
i
]
=
posq
[
i
];
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
delta
.
x
=
max
((
real
)
0
,
fabs
(
delta
.
x
)
-
boxSize
.
x
);
delta
.
y
=
max
((
real
)
0
,
fabs
(
delta
.
y
)
-
boxSize
.
y
);
delta
.
z
=
max
((
real
)
0
,
fabs
(
delta
.
z
)
-
boxSize
.
z
);
#if (__CUDA_ARCH__ < 200)
flags
[
threadIdx
.
x
]
=
(
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
>
CUTOFF_SQUARED
?
0
:
1
<<
index
);
if
(
index
%
4
==
0
)
flags
[
threadIdx
.
x
]
+=
flags
[
threadIdx
.
x
+
1
]
+
flags
[
threadIdx
.
x
+
2
]
+
flags
[
threadIdx
.
x
+
3
];
unsigned
int
allFlags
=
0
;
if
(
index
==
0
)
allFlags
=
flags
[
threadIdx
.
x
]
+
flags
[
threadIdx
.
x
+
4
]
+
flags
[
threadIdx
.
x
+
8
]
+
flags
[
threadIdx
.
x
+
12
]
+
flags
[
threadIdx
.
x
+
16
]
+
flags
[
threadIdx
.
x
+
20
]
+
flags
[
threadIdx
.
x
+
24
]
+
flags
[
threadIdx
.
x
+
28
];
#else
unsigned
int
allFlags
=
__ballot
(
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
<
CUTOFF_SQUARED
);
#endif
// Sum the flags.
if
(
index
==
0
)
{
// Count how many flags are set, and based on that decide whether to compute all interactions
// or only a fraction of them.
int
bits
=
__popc
(
allFlags
);
interactionFlags
[
pos
]
=
(
bits
>
12
?
0xFFFFFFFF
:
allFlags
);
}
lasty
=
y
;
}
pos
++
;
}
}
}
platforms/cuda/src/kernels/gbsaObc1.cu
View file @
93c467b2
#define DIELECTRIC_OFFSET 0.009f
#define DIELECTRIC_OFFSET 0.009f
#define PROBE_RADIUS 0.14f
#define PROBE_RADIUS 0.14f
#define SURFACE_AREA_FACTOR -170.351730667551f //-6.0f*3.14159265358979323846f*0.0216f*1000.0f*0.4184f;
#define SURFACE_AREA_FACTOR -170.351730667551f //-6.0f*3.14159265358979323846f*0.0216f*1000.0f*0.4184f;
#define TILE_SIZE 32
#define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)
#define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)
/**
/**
...
@@ -70,58 +69,30 @@ typedef struct {
...
@@ -70,58 +69,30 @@ typedef struct {
*/
*/
extern
"C"
__global__
void
computeBornSum
(
unsigned
long
long
*
__restrict__
global_bornSum
,
const
real4
*
__restrict__
posq
,
const
float2
*
__restrict__
global_params
,
extern
"C"
__global__
void
computeBornSum
(
unsigned
long
long
*
__restrict__
global_bornSum
,
const
real4
*
__restrict__
posq
,
const
float2
*
__restrict__
global_params
,
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
unsigned
int
*
__restrict__
interacti
onFlag
s
,
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
const
unsigned
int
*
__restrict__
interacti
ngAtom
s
,
#else
#else
unsigned
int
numTiles
,
unsigned
int
numTiles
,
#endif
#endif
unsigned
int
*
exclusionIndices
,
unsigned
int
*
exclusionRowIndices
)
{
const
ushort2
*
__restrict__
exclusionTiles
)
{
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
const
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
#ifdef USE_CUTOFF
unsigned
int
numTiles
=
interactionCount
[
0
];
unsigned
int
pos
=
warp
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
unsigned
int
end
=
(
warp
+
1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
#else
unsigned
int
pos
=
warp
*
numTiles
/
totalWarps
;
unsigned
int
end
=
(
warp
+
1
)
*
numTiles
/
totalWarps
;
#endif
unsigned
int
lasty
=
0xFFFFFFFF
;
__shared__
AtomData1
localData
[
FORCE_WORK_GROUP_SIZE
];
__shared__
unsigned
int
exclusionRange
[
2
*
WARPS_PER_GROUP
];
__shared__
int
exclusionIndex
[
WARPS_PER_GROUP
];
#ifndef ENABLE_SHUFFLE
__shared__
real
tempBuffer
[
FORCE_WORK_GROUP_SIZE
];
#endif
do
{
// Extract the coordinates of this tile
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
const
unsigned
int
localGroupIndex
=
threadIdx
.
x
/
TILE_SIZE
;
__shared__
AtomData1
localData
[
FORCE_WORK_GROUP_SIZE
];
unsigned
int
x
,
y
;
// First loop: process tiles that contain exclusions.
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE
+
warp
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE
+
(
warp
+
1
)
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
for
(
int
pos
=
firstExclusionTile
;
pos
<
lastExclusionTile
;
pos
++
)
{
const
ushort2
tileIndices
=
exclusionTiles
[
pos
];
const
unsigned
int
x
=
tileIndices
.
x
;
const
unsigned
int
y
=
tileIndices
.
y
;
real
bornSum
=
0
;
real
bornSum
=
0
;
if
(
pos
<
end
)
{
#ifdef USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles
[
pos
];
x
=
tileIndices
.
x
;
y
=
tileIndices
.
y
;
}
else
#endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
sqrt
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}
}
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real4
posq1
=
posq
[
atom1
];
real4
posq1
=
posq
[
atom1
];
float2
params1
=
global_params
[
atom1
];
float2
params1
=
global_params
[
atom1
];
if
(
pos
>=
end
)
if
(
x
==
y
)
{
;
// This warp is done.
else
if
(
x
==
y
)
{
// This tile is on the diagonal.
// This tile is on the diagonal.
localData
[
threadIdx
.
x
].
x
=
posq1
.
x
;
localData
[
threadIdx
.
x
].
x
=
posq1
.
x
;
...
@@ -155,8 +126,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
...
@@ -155,8 +126,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
));
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
));
bornSum
+=
l_ij
-
u_ij
+
(
0.50
f
*
invR
*
ratio
)
+
0.25
f
*
(
r
*
(
u_ij2
-
l_ij2
)
+
bornSum
+=
l_ij
-
u_ij
+
(
0.50
f
*
invR
*
ratio
)
+
0.25
f
*
(
r
*
(
u_ij2
-
l_ij2
)
+
(
params2
.
y
*
params2
.
y
*
invR
)
*
(
l_ij2
-
u_ij2
));
(
params2
.
y
*
params2
.
y
*
invR
)
*
(
l_ij2
-
u_ij2
));
if
(
params1
.
x
<
params2
.
y
-
r
)
bornSum
+=
(
params1
.
x
<
params2
.
y
-
r
?
2.0
f
*
(
RECIP
(
params1
.
x
)
-
l_ij
)
:
0
);
bornSum
+=
2.0
f
*
(
RECIP
(
params1
.
x
)
-
l_ij
);
}
}
}
}
}
}
...
@@ -164,7 +134,6 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
...
@@ -164,7 +134,6 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
else
{
else
{
// This is an off-diagonal tile.
// This is an off-diagonal tile.
if
(
lasty
!=
y
)
{
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
real4
tempPosq
=
posq
[
j
];
real4
tempPosq
=
posq
[
j
];
localData
[
threadIdx
.
x
].
x
=
tempPosq
.
x
;
localData
[
threadIdx
.
x
].
x
=
tempPosq
.
x
;
...
@@ -174,46 +143,27 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
...
@@ -174,46 +143,27 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
float2
tempParams
=
global_params
[
j
];
float2
tempParams
=
global_params
[
j
];
localData
[
threadIdx
.
x
].
radius
=
tempParams
.
x
;
localData
[
threadIdx
.
x
].
radius
=
tempParams
.
x
;
localData
[
threadIdx
.
x
].
scaledRadius
=
tempParams
.
y
;
localData
[
threadIdx
.
x
].
scaledRadius
=
tempParams
.
y
;
}
localData
[
threadIdx
.
x
].
bornSum
=
0.0
f
;
localData
[
threadIdx
.
x
].
bornSum
=
0.0
f
;
#ifdef USE_CUTOFF
unsigned
int
flags
=
(
numTiles
<=
maxTiles
?
interactionFlags
[
pos
]
:
0xFFFFFFFF
);
bool
computeSubset
=
false
;
if
(
flags
!=
0xFFFFFFFF
)
{
if
(
tgx
<
2
)
exclusionRange
[
2
*
localGroupIndex
+
tgx
]
=
exclusionRowIndices
[
x
+
tgx
];
if
(
tgx
==
0
)
exclusionIndex
[
localGroupIndex
]
=
-
1
;
for
(
unsigned
int
i
=
exclusionRange
[
2
*
localGroupIndex
]
+
tgx
;
i
<
exclusionRange
[
2
*
localGroupIndex
+
1
];
i
+=
TILE_SIZE
)
if
(
exclusionIndices
[
i
]
==
y
)
exclusionIndex
[
localGroupIndex
]
=
i
*
TILE_SIZE
;
computeSubset
=
(
exclusionIndex
[
localGroupIndex
]
==
-
1
);
}
if
(
computeSubset
)
{
if
(
flags
==
0
)
{
// No interactions in this tile.
}
else
{
// Compute only a subset of the interactions in this tile.
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
// Compute the full set of interactions in this tile.
if
((
flags
&
(
1
<<
j
))
!=
0
)
{
real3
delta
=
make_real3
(
localData
[
tbx
+
j
].
x
-
posq1
.
x
,
localData
[
tbx
+
j
].
y
-
posq1
.
y
,
localData
[
tbx
+
j
].
z
-
posq1
.
z
);
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
delta
=
make_real3
(
localData
[
tbx
+
tj
].
x
-
posq1
.
x
,
localData
[
tbx
+
tj
].
y
-
posq1
.
y
,
localData
[
tbx
+
tj
].
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
real
sum
=
0
;
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
j
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
t
j
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
#else
#else
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
j
<
NUM_ATOMS
)
{
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
t
j
<
NUM_ATOMS
)
{
#endif
#endif
real
invR
=
RSQRT
(
r2
);
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
real
r
=
RECIP
(
invR
);
float2
params2
=
make_float2
(
localData
[
tbx
+
j
].
radius
,
localData
[
tbx
+
j
].
scaledRadius
);
float2
params2
=
make_float2
(
localData
[
tbx
+
t
j
].
radius
,
localData
[
tbx
+
t
j
].
scaledRadius
);
real
rScaledRadiusJ
=
r
+
params2
.
y
;
real
rScaledRadiusJ
=
r
+
params2
.
y
;
if
(
params1
.
x
<
rScaledRadiusJ
)
{
if
(
params1
.
x
<
rScaledRadiusJ
)
{
real
l_ij
=
RECIP
(
max
(
params1
.
x
,
fabs
(
r
-
params2
.
y
)));
real
l_ij
=
RECIP
(
max
(
params1
.
x
,
fabs
(
r
-
params2
.
y
)));
...
@@ -223,8 +173,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
...
@@ -223,8 +173,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
));
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
));
bornSum
+=
l_ij
-
u_ij
+
(
0.50
f
*
invR
*
ratio
)
+
0.25
f
*
(
r
*
(
u_ij2
-
l_ij2
)
+
bornSum
+=
l_ij
-
u_ij
+
(
0.50
f
*
invR
*
ratio
)
+
0.25
f
*
(
r
*
(
u_ij2
-
l_ij2
)
+
(
params2
.
y
*
params2
.
y
*
invR
)
*
(
l_ij2
-
u_ij2
));
(
params2
.
y
*
params2
.
y
*
invR
)
*
(
l_ij2
-
u_ij2
));
if
(
params1
.
x
<
params2
.
y
-
r
)
bornSum
+=
(
params1
.
x
<
params2
.
y
-
r
?
2.0
f
*
(
RECIP
(
params1
.
x
)
-
l_ij
)
:
0
);
bornSum
+=
2.0
f
*
(
RECIP
(
params1
.
x
)
-
l_ij
);
}
}
real
rScaledRadiusI
=
r
+
params1
.
y
;
real
rScaledRadiusI
=
r
+
params1
.
y
;
if
(
params2
.
x
<
rScaledRadiusI
)
{
if
(
params2
.
x
<
rScaledRadiusI
)
{
...
@@ -235,37 +184,160 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
...
@@ -235,37 +184,160 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
));
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
));
real
term
=
l_ij
-
u_ij
+
(
0.50
f
*
invR
*
ratio
)
+
0.25
f
*
(
r
*
(
u_ij2
-
l_ij2
)
+
real
term
=
l_ij
-
u_ij
+
(
0.50
f
*
invR
*
ratio
)
+
0.25
f
*
(
r
*
(
u_ij2
-
l_ij2
)
+
(
params1
.
y
*
params1
.
y
*
invR
)
*
(
l_ij2
-
u_ij2
));
(
params1
.
y
*
params1
.
y
*
invR
)
*
(
l_ij2
-
u_ij2
));
if
(
params2
.
x
<
params1
.
y
-
r
)
term
+=
(
params2
.
x
<
params1
.
y
-
r
?
2.0
f
*
(
RECIP
(
params2
.
x
)
-
l_ij
)
:
0
);
term
+=
2.0
f
*
(
RECIP
(
params2
.
x
)
-
l_ij
);
localData
[
tbx
+
tj
].
bornSum
+=
term
;
sum
=
term
;
}
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
}
}
// Sum the forces on atom j
.
// Write results
.
#ifdef ENABLE_SHUFFLE
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
for
(
int
i
=
16
;
i
>=
1
;
i
/=
2
)
atomicAdd
(
&
global_bornSum
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
bornSum
*
0x100000000
)));
sum
+=
__shfl_xor
(
sum
,
i
,
32
);
if
(
x
!=
y
)
{
if
(
tgx
==
0
)
offset
=
y
*
TILE_SIZE
+
tgx
;
localData
[
tbx
+
j
].
bornSum
+=
sum
;
atomicAdd
(
&
global_bornSum
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
bornSum
*
0x100000000
)));
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
unsigned
int
numTiles
=
interactionCount
[
0
];
int
pos
=
warp
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
int
end
=
(
warp
+
1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
#else
#else
tempBuffer
[
threadIdx
.
x
]
=
sum
;
int
pos
=
warp
*
numTiles
/
totalWarps
;
if
(
tgx
%
4
==
0
)
int
end
=
(
warp
+
1
)
*
numTiles
/
totalWarps
;
tempBuffer
[
threadIdx
.
x
]
+=
tempBuffer
[
threadIdx
.
x
+
1
]
+
tempBuffer
[
threadIdx
.
x
+
2
]
+
tempBuffer
[
threadIdx
.
x
+
3
];
if
(
tgx
==
0
)
localData
[
tbx
+
j
].
bornSum
+=
tempBuffer
[
threadIdx
.
x
]
+
tempBuffer
[
threadIdx
.
x
+
4
]
+
tempBuffer
[
threadIdx
.
x
+
8
]
+
tempBuffer
[
threadIdx
.
x
+
12
]
+
tempBuffer
[
threadIdx
.
x
+
16
]
+
tempBuffer
[
threadIdx
.
x
+
20
]
+
tempBuffer
[
threadIdx
.
x
+
24
]
+
tempBuffer
[
threadIdx
.
x
+
28
];
#endif
#endif
int
skipBase
=
0
;
int
currentSkipIndex
=
tbx
;
__shared__
int
atomIndices
[
FORCE_WORK_GROUP_SIZE
];
__shared__
int
skipTiles
[
FORCE_WORK_GROUP_SIZE
];
skipTiles
[
threadIdx
.
x
]
=
-
1
;
while
(
pos
<
end
)
{
real
bornSum
=
0
;
bool
includeTile
=
true
;
// Extract the coordinates of this tile.
unsigned
int
x
,
y
;
bool
singlePeriodicCopy
=
false
;
#ifdef USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles
[
pos
];
x
=
tileIndices
.
x
;
singlePeriodicCopy
=
tileIndices
.
y
;
}
else
#endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}
}
// Skip over tiles that have exclusions, since they were already processed.
while
(
skipTiles
[
tbx
+
TILE_SIZE
-
1
]
<
pos
)
{
if
(
skipBase
+
tgx
<
NUM_TILES_WITH_EXCLUSIONS
)
{
ushort2
tile
=
exclusionTiles
[
skipBase
+
tgx
];
skipTiles
[
threadIdx
.
x
]
=
tile
.
x
+
tile
.
y
*
NUM_BLOCKS
-
tile
.
y
*
(
tile
.
y
+
1
)
/
2
;
}
}
else
skipTiles
[
threadIdx
.
x
]
=
end
;
skipBase
+=
TILE_SIZE
;
currentSkipIndex
=
tbx
;
}
while
(
skipTiles
[
currentSkipIndex
]
<
pos
)
currentSkipIndex
++
;
includeTile
=
(
skipTiles
[
currentSkipIndex
]
!=
pos
);
}
if
(
includeTile
)
{
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
// Load atom data for this tile.
real4
posq1
=
posq
[
atom1
];
float2
params1
=
global_params
[
atom1
];
#ifdef USE_CUTOFF
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
]
:
y
*
TILE_SIZE
+
tgx
);
#else
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
atomIndices
[
threadIdx
.
x
]
=
j
;
if
(
j
<
PADDED_NUM_ATOMS
)
{
real4
tempPosq
=
posq
[
j
];
localData
[
threadIdx
.
x
].
x
=
tempPosq
.
x
;
localData
[
threadIdx
.
x
].
y
=
tempPosq
.
y
;
localData
[
threadIdx
.
x
].
z
=
tempPosq
.
z
;
localData
[
threadIdx
.
x
].
q
=
tempPosq
.
w
;
float2
tempParams
=
global_params
[
j
];
localData
[
threadIdx
.
x
].
radius
=
tempParams
.
x
;
localData
[
threadIdx
.
x
].
scaledRadius
=
tempParams
.
y
;
localData
[
threadIdx
.
x
].
bornSum
=
0.0
f
;
}
#ifdef USE_PERIODIC
if
(
singlePeriodicCopy
)
{
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
real4
blockCenterX
=
blockCenter
[
x
];
posq1
.
x
-=
floor
((
posq1
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
posq1
.
y
-=
floor
((
posq1
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
posq1
.
z
-=
floor
((
posq1
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
localData
[
threadIdx
.
x
].
x
-=
floor
((
localData
[
threadIdx
.
x
].
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
localData
[
threadIdx
.
x
].
y
-=
floor
((
localData
[
threadIdx
.
x
].
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
localData
[
threadIdx
.
x
].
z
-=
floor
((
localData
[
threadIdx
.
x
].
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
delta
=
make_real3
(
localData
[
tbx
+
tj
].
x
-
posq1
.
x
,
localData
[
tbx
+
tj
].
y
-
posq1
.
y
,
localData
[
tbx
+
tj
].
z
-
posq1
.
z
);
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
int
atom2
=
atomIndices
[
tbx
+
tj
];
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
float2
params2
=
make_float2
(
localData
[
tbx
+
tj
].
radius
,
localData
[
tbx
+
tj
].
scaledRadius
);
real
rScaledRadiusJ
=
r
+
params2
.
y
;
if
(
params1
.
x
<
rScaledRadiusJ
)
{
real
l_ij
=
RECIP
(
max
(
params1
.
x
,
fabs
(
r
-
params2
.
y
)));
real
u_ij
=
RECIP
(
rScaledRadiusJ
);
real
l_ij2
=
l_ij
*
l_ij
;
real
u_ij2
=
u_ij
*
u_ij
;
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
));
bornSum
+=
l_ij
-
u_ij
+
(
0.50
f
*
invR
*
ratio
)
+
0.25
f
*
(
r
*
(
u_ij2
-
l_ij2
)
+
(
params2
.
y
*
params2
.
y
*
invR
)
*
(
l_ij2
-
u_ij2
));
bornSum
+=
(
params1
.
x
<
params2
.
y
-
r
?
2.0
f
*
(
RECIP
(
params1
.
x
)
-
l_ij
)
:
0
);
}
real
rScaledRadiusI
=
r
+
params1
.
y
;
if
(
params2
.
x
<
rScaledRadiusI
)
{
real
l_ij
=
RECIP
(
max
(
params2
.
x
,
fabs
(
r
-
params1
.
y
)));
real
u_ij
=
RECIP
(
rScaledRadiusI
);
real
l_ij2
=
l_ij
*
l_ij
;
real
u_ij2
=
u_ij
*
u_ij
;
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
));
real
term
=
l_ij
-
u_ij
+
(
0.50
f
*
invR
*
ratio
)
+
0.25
f
*
(
r
*
(
u_ij2
-
l_ij2
)
+
(
params1
.
y
*
params1
.
y
*
invR
)
*
(
l_ij2
-
u_ij2
));
term
+=
(
params2
.
x
<
params1
.
y
-
r
?
2.0
f
*
(
RECIP
(
params2
.
x
)
-
l_ij
)
:
0
);
localData
[
tbx
+
tj
].
bornSum
+=
term
;
}
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
}
}
else
else
#endif
#endif
{
{
// Compute the full set of interactions in this tile
.
// We need to apply periodic boundary conditions separately for each interaction
.
unsigned
int
tj
=
tgx
;
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
delta
=
make_real3
(
localData
[
tbx
+
tj
].
x
-
posq1
.
x
,
localData
[
tbx
+
tj
].
y
-
posq1
.
y
,
localData
[
tbx
+
tj
].
z
-
posq1
.
z
);
real3
delta
=
make_real3
(
localData
[
tbx
+
tj
].
x
-
posq1
.
x
,
localData
[
tbx
+
tj
].
y
-
posq1
.
y
,
localData
[
tbx
+
tj
].
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
...
@@ -273,10 +345,11 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
...
@@ -273,10 +345,11 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
int
atom2
=
atomIndices
[
tbx
+
tj
];
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
tj
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
#else
#else
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
tj
<
NUM_ATOMS
)
{
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
#endif
#endif
real
invR
=
RSQRT
(
r2
);
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
real
r
=
RECIP
(
invR
);
...
@@ -290,8 +363,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
...
@@ -290,8 +363,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
));
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
));
bornSum
+=
l_ij
-
u_ij
+
(
0.50
f
*
invR
*
ratio
)
+
0.25
f
*
(
r
*
(
u_ij2
-
l_ij2
)
+
bornSum
+=
l_ij
-
u_ij
+
(
0.50
f
*
invR
*
ratio
)
+
0.25
f
*
(
r
*
(
u_ij2
-
l_ij2
)
+
(
params2
.
y
*
params2
.
y
*
invR
)
*
(
l_ij2
-
u_ij2
));
(
params2
.
y
*
params2
.
y
*
invR
)
*
(
l_ij2
-
u_ij2
));
if
(
params1
.
x
<
params2
.
y
-
r
)
bornSum
+=
(
params1
.
x
<
params2
.
y
-
r
?
2.0
f
*
(
RECIP
(
params1
.
x
)
-
l_ij
)
:
0
);
bornSum
+=
2.0
f
*
(
RECIP
(
params1
.
x
)
-
l_ij
);
}
}
real
rScaledRadiusI
=
r
+
params1
.
y
;
real
rScaledRadiusI
=
r
+
params1
.
y
;
if
(
params2
.
x
<
rScaledRadiusI
)
{
if
(
params2
.
x
<
rScaledRadiusI
)
{
...
@@ -302,30 +374,27 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
...
@@ -302,30 +374,27 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
));
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
));
real
term
=
l_ij
-
u_ij
+
(
0.50
f
*
invR
*
ratio
)
+
0.25
f
*
(
r
*
(
u_ij2
-
l_ij2
)
+
real
term
=
l_ij
-
u_ij
+
(
0.50
f
*
invR
*
ratio
)
+
0.25
f
*
(
r
*
(
u_ij2
-
l_ij2
)
+
(
params1
.
y
*
params1
.
y
*
invR
)
*
(
l_ij2
-
u_ij2
));
(
params1
.
y
*
params1
.
y
*
invR
)
*
(
l_ij2
-
u_ij2
));
if
(
params2
.
x
<
params1
.
y
-
r
)
term
+=
(
params2
.
x
<
params1
.
y
-
r
?
2.0
f
*
(
RECIP
(
params2
.
x
)
-
l_ij
)
:
0
);
term
+=
2.0
f
*
(
RECIP
(
params2
.
x
)
-
l_ij
);
localData
[
tbx
+
tj
].
bornSum
+=
term
;
localData
[
tbx
+
tj
].
bornSum
+=
term
;
}
}
}
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
}
}
}
}
// Write results.
// Write results.
if
(
pos
<
end
)
{
atomicAdd
(
&
global_bornSum
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
bornSum
*
0x100000000
)));
const
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
#ifdef USE_CUTOFF
atomicAdd
(
&
global_bornSum
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
bornSum
*
0x100000000
)));
unsigned
int
atom2
=
atomIndices
[
threadIdx
.
x
];
}
#else
if
(
pos
<
end
&&
x
!=
y
)
{
unsigned
int
atom2
=
y
*
TILE_SIZE
+
tgx
;
const
unsigned
int
offset
=
y
*
TILE_SIZE
+
tgx
;
#endif
atomicAdd
(
&
global_bornSum
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
bornSum
*
0x100000000
)));
if
(
atom2
<
PADDED_NUM_ATOMS
)
atomicAdd
(
&
global_bornSum
[
atom2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
bornSum
*
0x100000000
)));
}
}
lasty
=
y
;
pos
++
;
pos
++
;
}
while
(
pos
<
end
);
}
}
}
typedef
struct
{
typedef
struct
{
...
@@ -342,54 +411,27 @@ typedef struct {
...
@@ -342,54 +411,27 @@ typedef struct {
extern
"C"
__global__
void
computeGBSAForce1
(
unsigned
long
long
*
__restrict__
forceBuffers
,
unsigned
long
long
*
__restrict__
global_bornForce
,
extern
"C"
__global__
void
computeGBSAForce1
(
unsigned
long
long
*
__restrict__
forceBuffers
,
unsigned
long
long
*
__restrict__
global_bornForce
,
real
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
global_bornRadii
,
real
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
global_bornRadii
,
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
unsigned
int
*
__restrict__
interacti
onFlag
s
,
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
const
unsigned
int
*
__restrict__
interacti
ngAtom
s
,
#else
#else
unsigned
int
numTiles
,
unsigned
int
numTiles
,
#endif
#endif
unsigned
int
*
exclusionIndices
,
unsigned
int
*
exclusionRowIndices
)
{
const
ushort2
*
__restrict__
exclusionTiles
)
{
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
const
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
#ifdef USE_CUTOFF
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
unsigned
int
numTiles
=
interactionCount
[
0
];
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
unsigned
int
pos
=
warp
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
unsigned
int
end
=
(
warp
+
1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
#else
unsigned
int
pos
=
warp
*
numTiles
/
totalWarps
;
unsigned
int
end
=
(
warp
+
1
)
*
numTiles
/
totalWarps
;
#endif
real
energy
=
0
;
real
energy
=
0
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__shared__
AtomData2
localData
[
FORCE_WORK_GROUP_SIZE
];
__shared__
AtomData2
localData
[
FORCE_WORK_GROUP_SIZE
];
__shared__
unsigned
int
exclusionRange
[
2
*
WARPS_PER_GROUP
];
__shared__
int
exclusionIndex
[
WARPS_PER_GROUP
];
#ifndef ENABLE_SHUFFLE
__shared__
real4
tempBuffer
[
FORCE_WORK_GROUP_SIZE
];
#endif
do
{
// First loop: process tiles that contain exclusions.
// Extract the coordinates of this tile
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE
+
warp
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE
+
(
warp
+
1
)
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
const
unsigned
int
localGroupIndex
=
threadIdx
.
x
/
TILE_SIZE
;
for
(
int
pos
=
firstExclusionTile
;
pos
<
lastExclusionTile
;
pos
++
)
{
unsigned
int
x
,
y
;
const
ushort2
tileIndices
=
exclusionTiles
[
pos
];
const
unsigned
int
x
=
tileIndices
.
x
;
const
unsigned
int
y
=
tileIndices
.
y
;
real4
force
=
make_real4
(
0
);
real4
force
=
make_real4
(
0
);
if
(
pos
<
end
)
{
#ifdef USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles
[
pos
];
x
=
tileIndices
.
x
;
y
=
tileIndices
.
y
;
}
else
#endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
sqrt
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}
}
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real4
posq1
=
posq
[
atom1
];
real4
posq1
=
posq
[
atom1
];
real
bornRadius1
=
global_bornRadii
[
atom1
];
real
bornRadius1
=
global_bornRadii
[
atom1
];
...
@@ -441,7 +483,6 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
...
@@ -441,7 +483,6 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
else
{
else
{
// This is an off-diagonal tile.
// This is an off-diagonal tile.
if
(
lasty
!=
y
)
{
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
real4
tempPosq
=
posq
[
j
];
real4
tempPosq
=
posq
[
j
];
localData
[
threadIdx
.
x
].
x
=
tempPosq
.
x
;
localData
[
threadIdx
.
x
].
x
=
tempPosq
.
x
;
...
@@ -449,35 +490,15 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
...
@@ -449,35 +490,15 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
localData
[
threadIdx
.
x
].
z
=
tempPosq
.
z
;
localData
[
threadIdx
.
x
].
z
=
tempPosq
.
z
;
localData
[
threadIdx
.
x
].
q
=
tempPosq
.
w
;
localData
[
threadIdx
.
x
].
q
=
tempPosq
.
w
;
localData
[
threadIdx
.
x
].
bornRadius
=
global_bornRadii
[
j
];
localData
[
threadIdx
.
x
].
bornRadius
=
global_bornRadii
[
j
];
}
localData
[
threadIdx
.
x
].
fx
=
0.0
f
;
localData
[
threadIdx
.
x
].
fx
=
0.0
f
;
localData
[
threadIdx
.
x
].
fy
=
0.0
f
;
localData
[
threadIdx
.
x
].
fy
=
0.0
f
;
localData
[
threadIdx
.
x
].
fz
=
0.0
f
;
localData
[
threadIdx
.
x
].
fz
=
0.0
f
;
localData
[
threadIdx
.
x
].
fw
=
0.0
f
;
localData
[
threadIdx
.
x
].
fw
=
0.0
f
;
#ifdef USE_CUTOFF
unsigned
int
tj
=
tgx
;
unsigned
int
flags
=
(
numTiles
<=
maxTiles
?
interactionFlags
[
pos
]
:
0xFFFFFFFF
);
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
bool
computeSubset
=
false
;
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
tj
<
NUM_ATOMS
)
{
if
(
flags
!=
0xFFFFFFFF
)
{
real4
posq2
=
make_real4
(
localData
[
tbx
+
tj
].
x
,
localData
[
tbx
+
tj
].
y
,
localData
[
tbx
+
tj
].
z
,
localData
[
tbx
+
tj
].
q
);
if
(
tgx
<
2
)
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
exclusionRange
[
2
*
localGroupIndex
+
tgx
]
=
exclusionRowIndices
[
x
+
tgx
];
if
(
tgx
==
0
)
exclusionIndex
[
localGroupIndex
]
=
-
1
;
for
(
unsigned
int
i
=
exclusionRange
[
2
*
localGroupIndex
]
+
tgx
;
i
<
exclusionRange
[
2
*
localGroupIndex
+
1
];
i
+=
TILE_SIZE
)
if
(
exclusionIndices
[
i
]
==
y
)
exclusionIndex
[
localGroupIndex
]
=
i
*
TILE_SIZE
;
computeSubset
=
(
exclusionIndex
[
localGroupIndex
]
==
-
1
);
}
if
(
computeSubset
)
{
if
(
flags
==
0
)
{
// No interactions in this tile.
}
else
{
// Compute only a subset of the interactions in this tile.
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
if
((
flags
&
(
1
<<
j
))
!=
0
)
{
real4
posq2
=
make_real4
(
localData
[
tbx
+
j
].
x
,
localData
[
tbx
+
j
].
y
,
localData
[
tbx
+
j
].
z
,
localData
[
tbx
+
j
].
q
);
real4
delta
=
make_real4
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
,
0
);
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
...
@@ -489,7 +510,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
...
@@ -489,7 +510,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
#endif
#endif
real
invR
=
RSQRT
(
r2
);
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
real
r
=
RECIP
(
invR
);
real
bornRadius2
=
localData
[
tbx
+
j
].
bornRadius
;
real
bornRadius2
=
localData
[
tbx
+
t
j
].
bornRadius
;
real
alpha2_ij
=
bornRadius1
*
bornRadius2
;
real
alpha2_ij
=
bornRadius1
*
bornRadius2
;
real
D_ij
=
r2
*
RECIP
(
4.0
f
*
alpha2_ij
);
real
D_ij
=
r2
*
RECIP
(
4.0
f
*
alpha2_ij
);
real
expTerm
=
EXP
(
-
D_ij
);
real
expTerm
=
EXP
(
-
D_ij
);
...
@@ -499,67 +520,178 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
...
@@ -499,67 +520,178 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
real
Gpol
=
tempEnergy
*
RECIP
(
denominator2
);
real
Gpol
=
tempEnergy
*
RECIP
(
denominator2
);
real
dGpol_dalpha2_ij
=
-
0.5
f
*
Gpol
*
expTerm
*
(
1.0
f
+
D_ij
);
real
dGpol_dalpha2_ij
=
-
0.5
f
*
Gpol
*
expTerm
*
(
1.0
f
+
D_ij
);
real
dEdR
=
Gpol
*
(
1.0
f
-
0.25
f
*
expTerm
);
real
dEdR
=
Gpol
*
(
1.0
f
-
0.25
f
*
expTerm
);
#ifdef USE_CUTOFF
if
(
atom1
>=
NUM_ATOMS
||
y
*
TILE_SIZE
+
j
>=
NUM_ATOMS
||
r2
>
CUTOFF_SQUARED
)
{
#else
if
(
atom1
>=
NUM_ATOMS
||
y
*
TILE_SIZE
+
j
>=
NUM_ATOMS
)
{
#endif
dEdR
=
0.0
f
;
dGpol_dalpha2_ij
=
0.0
f
;
tempEnergy
=
0.0
f
;
}
energy
+=
tempEnergy
;
force
.
w
+=
dGpol_dalpha2_ij
*
bornRadius2
;
force
.
w
+=
dGpol_dalpha2_ij
*
bornRadius2
;
energy
+=
tempEnergy
;
delta
*=
dEdR
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
force
.
z
-=
delta
.
z
;
delta
.
w
=
dGpol_dalpha2_ij
*
bornRadius1
;
localData
[
tbx
+
tj
].
fx
+=
delta
.
x
;
localData
[
tbx
+
tj
].
fy
+=
delta
.
y
;
localData
[
tbx
+
tj
].
fz
+=
delta
.
z
;
localData
[
tbx
+
tj
].
fw
+=
dGpol_dalpha2_ij
*
bornRadius1
;
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
}
}
else
delta
=
make_real4
(
0
);
#endif
#endif
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
// Sum the forces on atom j
.
// Write results
.
#ifdef ENABLE_SHUFFLE
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
for
(
int
i
=
16
;
i
>=
1
;
i
/=
2
)
{
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0x100000000
)));
delta
.
x
+=
__shfl_xor
(
delta
.
x
,
i
,
32
);
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
y
*
0x100000000
)));
delta
.
y
+=
__shfl_xor
(
delta
.
y
,
i
,
32
);
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
z
*
0x100000000
)));
delta
.
z
+=
__shfl_xor
(
delta
.
z
,
i
,
32
);
atomicAdd
(
&
global_bornForce
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
w
*
0x100000000
)));
delta
.
w
+=
__shfl_xor
(
delta
.
w
,
i
,
32
);
if
(
x
!=
y
)
{
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fx
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fy
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fz
*
0x100000000
)));
atomicAdd
(
&
global_bornForce
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fw
*
0x100000000
)));
}
}
if
(
tgx
==
0
)
{
localData
[
tbx
+
j
].
fx
+=
delta
.
x
;
localData
[
tbx
+
j
].
fy
+=
delta
.
y
;
localData
[
tbx
+
j
].
fz
+=
delta
.
z
;
localData
[
tbx
+
j
].
fw
+=
delta
.
w
;
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
unsigned
int
numTiles
=
interactionCount
[
0
];
int
pos
=
warp
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
int
end
=
(
warp
+
1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
#else
#else
tempBuffer
[
threadIdx
.
x
]
=
delta
;
int
pos
=
warp
*
numTiles
/
totalWarps
;
if
(
tgx
%
4
==
0
)
int
end
=
(
warp
+
1
)
*
numTiles
/
totalWarps
;
tempBuffer
[
threadIdx
.
x
]
+=
tempBuffer
[
threadIdx
.
x
+
1
]
+
tempBuffer
[
threadIdx
.
x
+
2
]
+
tempBuffer
[
threadIdx
.
x
+
3
];
#endif
if
(
tgx
==
0
)
{
int
skipBase
=
0
;
real4
sum
=
tempBuffer
[
threadIdx
.
x
]
+
tempBuffer
[
threadIdx
.
x
+
4
]
+
tempBuffer
[
threadIdx
.
x
+
8
]
+
tempBuffer
[
threadIdx
.
x
+
12
]
+
tempBuffer
[
threadIdx
.
x
+
16
]
+
tempBuffer
[
threadIdx
.
x
+
20
]
+
tempBuffer
[
threadIdx
.
x
+
24
]
+
tempBuffer
[
threadIdx
.
x
+
28
];
int
currentSkipIndex
=
tbx
;
localData
[
tbx
+
j
].
fx
+=
sum
.
x
;
__shared__
int
atomIndices
[
FORCE_WORK_GROUP_SIZE
];
localData
[
tbx
+
j
].
fy
+=
sum
.
y
;
__shared__
int
skipTiles
[
FORCE_WORK_GROUP_SIZE
];
localData
[
tbx
+
j
].
fz
+=
sum
.
z
;
skipTiles
[
threadIdx
.
x
]
=
-
1
;
localData
[
tbx
+
j
].
fw
+=
sum
.
w
;
while
(
pos
<
end
)
{
real4
force
=
make_real4
(
0
);
bool
includeTile
=
true
;
// Extract the coordinates of this tile.
unsigned
int
x
,
y
;
bool
singlePeriodicCopy
=
false
;
#ifdef USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles
[
pos
];
x
=
tileIndices
.
x
;
singlePeriodicCopy
=
tileIndices
.
y
;
}
else
#endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}
// Skip over tiles that have exclusions, since they were already processed.
while
(
skipTiles
[
tbx
+
TILE_SIZE
-
1
]
<
pos
)
{
if
(
skipBase
+
tgx
<
NUM_TILES_WITH_EXCLUSIONS
)
{
ushort2
tile
=
exclusionTiles
[
skipBase
+
tgx
];
skipTiles
[
threadIdx
.
x
]
=
tile
.
x
+
tile
.
y
*
NUM_BLOCKS
-
tile
.
y
*
(
tile
.
y
+
1
)
/
2
;
}
}
else
skipTiles
[
threadIdx
.
x
]
=
end
;
skipBase
+=
TILE_SIZE
;
currentSkipIndex
=
tbx
;
}
while
(
skipTiles
[
currentSkipIndex
]
<
pos
)
currentSkipIndex
++
;
includeTile
=
(
skipTiles
[
currentSkipIndex
]
!=
pos
);
}
if
(
includeTile
)
{
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
// Load atom data for this tile.
real4
posq1
=
posq
[
atom1
];
real
bornRadius1
=
global_bornRadii
[
atom1
];
#ifdef USE_CUTOFF
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
]
:
y
*
TILE_SIZE
+
tgx
);
#else
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
#endif
atomIndices
[
threadIdx
.
x
]
=
j
;
if
(
j
<
PADDED_NUM_ATOMS
)
{
real4
tempPosq
=
posq
[
j
];
localData
[
threadIdx
.
x
].
x
=
tempPosq
.
x
;
localData
[
threadIdx
.
x
].
y
=
tempPosq
.
y
;
localData
[
threadIdx
.
x
].
z
=
tempPosq
.
z
;
localData
[
threadIdx
.
x
].
q
=
tempPosq
.
w
;
localData
[
threadIdx
.
x
].
bornRadius
=
global_bornRadii
[
j
];
localData
[
threadIdx
.
x
].
fx
=
0.0
f
;
localData
[
threadIdx
.
x
].
fy
=
0.0
f
;
localData
[
threadIdx
.
x
].
fz
=
0.0
f
;
localData
[
threadIdx
.
x
].
fw
=
0.0
f
;
}
#ifdef USE_PERIODIC
if
(
singlePeriodicCopy
)
{
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
real4
blockCenterX
=
blockCenter
[
x
];
posq1
.
x
-=
floor
((
posq1
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
posq1
.
y
-=
floor
((
posq1
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
posq1
.
z
-=
floor
((
posq1
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
localData
[
threadIdx
.
x
].
x
-=
floor
((
localData
[
threadIdx
.
x
].
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
localData
[
threadIdx
.
x
].
y
-=
floor
((
localData
[
threadIdx
.
x
].
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
localData
[
threadIdx
.
x
].
z
-=
floor
((
localData
[
threadIdx
.
x
].
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
atomIndices
[
tbx
+
tj
];
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real4
posq2
=
make_real4
(
localData
[
tbx
+
tj
].
x
,
localData
[
tbx
+
tj
].
y
,
localData
[
tbx
+
tj
].
z
,
localData
[
tbx
+
tj
].
q
);
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
if
(
r2
<
CUTOFF_SQUARED
)
{
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
real
bornRadius2
=
localData
[
tbx
+
tj
].
bornRadius
;
real
alpha2_ij
=
bornRadius1
*
bornRadius2
;
real
D_ij
=
r2
*
RECIP
(
4.0
f
*
alpha2_ij
);
real
expTerm
=
EXP
(
-
D_ij
);
real
denominator2
=
r2
+
alpha2_ij
*
expTerm
;
real
denominator
=
SQRT
(
denominator2
);
real
tempEnergy
=
(
PREFACTOR
*
posq1
.
w
*
posq2
.
w
)
*
RECIP
(
denominator
);
real
Gpol
=
tempEnergy
*
RECIP
(
denominator2
);
real
dGpol_dalpha2_ij
=
-
0.5
f
*
Gpol
*
expTerm
*
(
1.0
f
+
D_ij
);
real
dEdR
=
Gpol
*
(
1.0
f
-
0.25
f
*
expTerm
);
force
.
w
+=
dGpol_dalpha2_ij
*
bornRadius2
;
energy
+=
tempEnergy
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
localData
[
tbx
+
tj
].
fx
+=
delta
.
x
;
localData
[
tbx
+
tj
].
fy
+=
delta
.
y
;
localData
[
tbx
+
tj
].
fz
+=
delta
.
z
;
localData
[
tbx
+
tj
].
fw
+=
dGpol_dalpha2_ij
*
bornRadius1
;
}
}
}
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
}
}
else
else
#endif
#endif
{
{
// Compute the full set of interactions in this tile
.
// We need to apply periodic boundary conditions separately for each interaction
.
unsigned
int
tj
=
tgx
;
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
tj
<
NUM_ATOMS
)
{
int
atom2
=
atomIndices
[
tbx
+
tj
];
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real4
posq2
=
make_real4
(
localData
[
tbx
+
tj
].
x
,
localData
[
tbx
+
tj
].
y
,
localData
[
tbx
+
tj
].
z
,
localData
[
tbx
+
tj
].
q
);
real4
posq2
=
make_real4
(
localData
[
tbx
+
tj
].
x
,
localData
[
tbx
+
tj
].
y
,
localData
[
tbx
+
tj
].
z
,
localData
[
tbx
+
tj
].
q
);
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
...
@@ -600,27 +732,26 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
...
@@ -600,27 +732,26 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
}
}
}
}
// Write results.
// Write results.
if
(
pos
<
end
)
{
atomicAdd
(
&
forceBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0x100000000
)));
const
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
z
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
y
*
0x100000000
)));
atomicAdd
(
&
global_bornForce
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
w
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
z
*
0x100000000
)));
#ifdef USE_CUTOFF
atomicAdd
(
&
global_bornForce
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
w
*
0x100000000
)));
unsigned
int
atom2
=
atomIndices
[
threadIdx
.
x
];
#else
unsigned
int
atom2
=
y
*
TILE_SIZE
+
tgx
;
#endif
if
(
atom2
<
PADDED_NUM_ATOMS
)
{
atomicAdd
(
&
forceBuffers
[
atom2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fx
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fy
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fz
*
0x100000000
)));
atomicAdd
(
&
global_bornForce
[
atom2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fw
*
0x100000000
)));
}
}
if
(
pos
<
end
&&
x
!=
y
)
{
const
unsigned
int
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fx
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fy
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fz
*
0x100000000
)));
atomicAdd
(
&
global_bornForce
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fw
*
0x100000000
)));
}
}
lasty
=
y
;
pos
++
;
pos
++
;
}
while
(
pos
<
end
);
}
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
}
}
platforms/cuda/src/kernels/integrationUtilities.cu
View file @
93c467b2
...
@@ -24,14 +24,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
...
@@ -24,14 +24,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
state
.
y
^=
state
.
y
<<
13
;
state
.
y
^=
state
.
y
<<
13
;
state
.
y
^=
state
.
y
>>
17
;
state
.
y
^=
state
.
y
>>
17
;
state
.
y
^=
state
.
y
<<
5
;
state
.
y
^=
state
.
y
<<
5
;
x1
=
sqrt
(
-
2.0
f
*
log
(
x1
));
x1
=
SQRT
(
-
2.0
f
*
LOG
(
x1
));
k
=
(
state
.
z
>>
2
)
+
(
state
.
w
>>
3
)
+
(
carry
>>
2
);
k
=
(
state
.
z
>>
2
)
+
(
state
.
w
>>
3
)
+
(
carry
>>
2
);
m
=
state
.
w
+
state
.
w
+
state
.
z
+
carry
;
m
=
state
.
w
+
state
.
w
+
state
.
z
+
carry
;
state
.
z
=
state
.
w
;
state
.
z
=
state
.
w
;
state
.
w
=
m
;
state
.
w
=
m
;
carry
=
k
>>
30
;
carry
=
k
>>
30
;
float
x2
=
(
float
)(
state
.
x
+
state
.
y
+
state
.
w
)
/
(
float
)
0xffffffff
;
float
x2
=
(
float
)(
state
.
x
+
state
.
y
+
state
.
w
)
/
(
float
)
0xffffffff
;
value
.
x
=
x1
*
cos
(
2.0
f
*
3.14159265
f
*
x2
);
value
.
x
=
x1
*
COS
(
2.0
f
*
3.14159265
f
*
x2
);
// Generate second value.
// Generate second value.
...
@@ -49,14 +49,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
...
@@ -49,14 +49,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
state
.
y
^=
state
.
y
<<
13
;
state
.
y
^=
state
.
y
<<
13
;
state
.
y
^=
state
.
y
>>
17
;
state
.
y
^=
state
.
y
>>
17
;
state
.
y
^=
state
.
y
<<
5
;
state
.
y
^=
state
.
y
<<
5
;
x3
=
sqrt
(
-
2.0
f
*
log
(
x3
));
x3
=
SQRT
(
-
2.0
f
*
LOG
(
x3
));
k
=
(
state
.
z
>>
2
)
+
(
state
.
w
>>
3
)
+
(
carry
>>
2
);
k
=
(
state
.
z
>>
2
)
+
(
state
.
w
>>
3
)
+
(
carry
>>
2
);
m
=
state
.
w
+
state
.
w
+
state
.
z
+
carry
;
m
=
state
.
w
+
state
.
w
+
state
.
z
+
carry
;
state
.
z
=
state
.
w
;
state
.
z
=
state
.
w
;
state
.
w
=
m
;
state
.
w
=
m
;
carry
=
k
>>
30
;
carry
=
k
>>
30
;
float
x4
=
(
float
)(
state
.
x
+
state
.
y
+
state
.
w
)
/
(
float
)
0xffffffff
;
float
x4
=
(
float
)(
state
.
x
+
state
.
y
+
state
.
w
)
/
(
float
)
0xffffffff
;
value
.
y
=
x3
*
cos
(
2.0
f
*
3.14159265
f
*
x4
);
value
.
y
=
x3
*
COS
(
2.0
f
*
3.14159265
f
*
x4
);
// Generate third value.
// Generate third value.
...
@@ -74,14 +74,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
...
@@ -74,14 +74,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
state
.
y
^=
state
.
y
<<
13
;
state
.
y
^=
state
.
y
<<
13
;
state
.
y
^=
state
.
y
>>
17
;
state
.
y
^=
state
.
y
>>
17
;
state
.
y
^=
state
.
y
<<
5
;
state
.
y
^=
state
.
y
<<
5
;
x5
=
sqrt
(
-
2.0
f
*
log
(
x5
));
x5
=
SQRT
(
-
2.0
f
*
LOG
(
x5
));
k
=
(
state
.
z
>>
2
)
+
(
state
.
w
>>
3
)
+
(
carry
>>
2
);
k
=
(
state
.
z
>>
2
)
+
(
state
.
w
>>
3
)
+
(
carry
>>
2
);
m
=
state
.
w
+
state
.
w
+
state
.
z
+
carry
;
m
=
state
.
w
+
state
.
w
+
state
.
z
+
carry
;
state
.
z
=
state
.
w
;
state
.
z
=
state
.
w
;
state
.
w
=
m
;
state
.
w
=
m
;
carry
=
k
>>
30
;
carry
=
k
>>
30
;
float
x6
=
(
float
)(
state
.
x
+
state
.
y
+
state
.
w
)
/
(
float
)
0xffffffff
;
float
x6
=
(
float
)(
state
.
x
+
state
.
y
+
state
.
w
)
/
(
float
)
0xffffffff
;
value
.
z
=
x5
*
cos
(
2.0
f
*
3.14159265
f
*
x6
);
value
.
z
=
x5
*
COS
(
2.0
f
*
3.14159265
f
*
x6
);
// Generate fourth value.
// Generate fourth value.
...
@@ -99,14 +99,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
...
@@ -99,14 +99,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
state
.
y
^=
state
.
y
<<
13
;
state
.
y
^=
state
.
y
<<
13
;
state
.
y
^=
state
.
y
>>
17
;
state
.
y
^=
state
.
y
>>
17
;
state
.
y
^=
state
.
y
<<
5
;
state
.
y
^=
state
.
y
<<
5
;
x7
=
sqrt
(
-
2.0
f
*
log
(
x7
));
x7
=
SQRT
(
-
2.0
f
*
LOG
(
x7
));
k
=
(
state
.
z
>>
2
)
+
(
state
.
w
>>
3
)
+
(
carry
>>
2
);
k
=
(
state
.
z
>>
2
)
+
(
state
.
w
>>
3
)
+
(
carry
>>
2
);
m
=
state
.
w
+
state
.
w
+
state
.
z
+
carry
;
m
=
state
.
w
+
state
.
w
+
state
.
z
+
carry
;
state
.
z
=
state
.
w
;
state
.
z
=
state
.
w
;
state
.
w
=
m
;
state
.
w
=
m
;
carry
=
k
>>
30
;
carry
=
k
>>
30
;
float
x8
=
(
float
)(
state
.
x
+
state
.
y
+
state
.
w
)
/
(
float
)
0xffffffff
;
float
x8
=
(
float
)(
state
.
x
+
state
.
y
+
state
.
w
)
/
(
float
)
0xffffffff
;
value
.
w
=
x7
*
cos
(
2.0
f
*
3.14159265
f
*
x8
);
value
.
w
=
x7
*
COS
(
2.0
f
*
3.14159265
f
*
x8
);
// Record the values.
// Record the values.
...
@@ -412,9 +412,9 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
...
@@ -412,9 +412,9 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
mixed
yaksYd
=
zaksZd
*
xaksXd
-
xaksZd
*
zaksXd
;
mixed
yaksYd
=
zaksZd
*
xaksXd
-
xaksZd
*
zaksXd
;
mixed
zaksYd
=
xaksZd
*
yaksXd
-
yaksZd
*
xaksXd
;
mixed
zaksYd
=
xaksZd
*
yaksXd
-
yaksZd
*
xaksXd
;
mixed
axlng
=
sqrt
(
xaksXd
*
xaksXd
+
yaksXd
*
yaksXd
+
zaksXd
*
zaksXd
);
mixed
axlng
=
SQRT
(
xaksXd
*
xaksXd
+
yaksXd
*
yaksXd
+
zaksXd
*
zaksXd
);
mixed
aylng
=
sqrt
(
xaksYd
*
xaksYd
+
yaksYd
*
yaksYd
+
zaksYd
*
zaksYd
);
mixed
aylng
=
SQRT
(
xaksYd
*
xaksYd
+
yaksYd
*
yaksYd
+
zaksYd
*
zaksYd
);
mixed
azlng
=
sqrt
(
xaksZd
*
xaksZd
+
yaksZd
*
yaksZd
+
zaksZd
*
zaksZd
);
mixed
azlng
=
SQRT
(
xaksZd
*
xaksZd
+
yaksZd
*
yaksZd
+
zaksZd
*
zaksZd
);
mixed
trns11
=
xaksXd
/
axlng
;
mixed
trns11
=
xaksXd
/
axlng
;
mixed
trns21
=
yaksXd
/
axlng
;
mixed
trns21
=
yaksXd
/
axlng
;
mixed
trns31
=
zaksXd
/
axlng
;
mixed
trns31
=
zaksXd
/
axlng
;
...
@@ -440,13 +440,13 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
...
@@ -440,13 +440,13 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
// --- Step2 A2' ---
// --- Step2 A2' ---
float
rc
=
0.5
f
*
params
.
y
;
float
rc
=
0.5
f
*
params
.
y
;
mixed
rb
=
sqrt
(
params
.
x
*
params
.
x
-
rc
*
rc
);
mixed
rb
=
SQRT
(
params
.
x
*
params
.
x
-
rc
*
rc
);
mixed
ra
=
rb
*
(
m1
+
m2
)
*
invTotalMass
;
mixed
ra
=
rb
*
(
m1
+
m2
)
*
invTotalMass
;
rb
-=
ra
;
rb
-=
ra
;
mixed
sinphi
=
za1d
/
ra
;
mixed
sinphi
=
za1d
/
ra
;
mixed
cosphi
=
sqrt
(
1
-
sinphi
*
sinphi
);
mixed
cosphi
=
SQRT
(
1
-
sinphi
*
sinphi
);
mixed
sinpsi
=
(
zb1d
-
zc1d
)
/
(
2
*
rc
*
cosphi
);
mixed
sinpsi
=
(
zb1d
-
zc1d
)
/
(
2
*
rc
*
cosphi
);
mixed
cospsi
=
sqrt
(
1
-
sinpsi
*
sinpsi
);
mixed
cospsi
=
SQRT
(
1
-
sinpsi
*
sinpsi
);
mixed
ya2d
=
ra
*
cosphi
;
mixed
ya2d
=
ra
*
cosphi
;
mixed
xb2d
=
-
rc
*
cospsi
;
mixed
xb2d
=
-
rc
*
cospsi
;
...
@@ -454,7 +454,7 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
...
@@ -454,7 +454,7 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
mixed
yc2d
=
-
rb
*
cosphi
+
rc
*
sinpsi
*
sinphi
;
mixed
yc2d
=
-
rb
*
cosphi
+
rc
*
sinpsi
*
sinphi
;
mixed
xb2d2
=
xb2d
*
xb2d
;
mixed
xb2d2
=
xb2d
*
xb2d
;
mixed
hh2
=
4.0
f
*
xb2d2
+
(
yb2d
-
yc2d
)
*
(
yb2d
-
yc2d
)
+
(
zb1d
-
zc1d
)
*
(
zb1d
-
zc1d
);
mixed
hh2
=
4.0
f
*
xb2d2
+
(
yb2d
-
yc2d
)
*
(
yb2d
-
yc2d
)
+
(
zb1d
-
zc1d
)
*
(
zb1d
-
zc1d
);
mixed
deltx
=
2.0
f
*
xb2d
+
sqrt
(
4.0
f
*
xb2d2
-
hh2
+
params
.
y
*
params
.
y
);
mixed
deltx
=
2.0
f
*
xb2d
+
SQRT
(
4.0
f
*
xb2d2
-
hh2
+
params
.
y
*
params
.
y
);
xb2d
-=
deltx
*
0.5
f
;
xb2d
-=
deltx
*
0.5
f
;
// --- Step3 al,be,ga ---
// --- Step3 al,be,ga ---
...
@@ -464,11 +464,11 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
...
@@ -464,11 +464,11 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
mixed
gamma
=
xb0d
*
yb1d
-
xb1d
*
yb0d
+
xc0d
*
yc1d
-
xc1d
*
yc0d
;
mixed
gamma
=
xb0d
*
yb1d
-
xb1d
*
yb0d
+
xc0d
*
yc1d
-
xc1d
*
yc0d
;
mixed
al2be2
=
alpha
*
alpha
+
beta
*
beta
;
mixed
al2be2
=
alpha
*
alpha
+
beta
*
beta
;
mixed
sintheta
=
(
alpha
*
gamma
-
beta
*
sqrt
(
al2be2
-
gamma
*
gamma
))
/
al2be2
;
mixed
sintheta
=
(
alpha
*
gamma
-
beta
*
SQRT
(
al2be2
-
gamma
*
gamma
))
/
al2be2
;
// --- Step4 A3' ---
// --- Step4 A3' ---
mixed
costheta
=
sqrt
(
1
-
sintheta
*
sintheta
);
mixed
costheta
=
SQRT
(
1
-
sintheta
*
sintheta
);
mixed
xa3d
=
-
ya2d
*
sintheta
;
mixed
xa3d
=
-
ya2d
*
sintheta
;
mixed
ya3d
=
ya2d
*
costheta
;
mixed
ya3d
=
ya2d
*
costheta
;
mixed
za3d
=
za1d
;
mixed
za3d
=
za1d
;
...
@@ -534,9 +534,9 @@ extern "C" __global__ void applySettleToVelocities(int numClusters, mixed tol, c
...
@@ -534,9 +534,9 @@ extern "C" __global__ void applySettleToVelocities(int numClusters, mixed tol, c
mixed3
eAB
=
make_mixed3
(
apos1
.
x
-
apos0
.
x
,
apos1
.
y
-
apos0
.
y
,
apos1
.
z
-
apos0
.
z
);
mixed3
eAB
=
make_mixed3
(
apos1
.
x
-
apos0
.
x
,
apos1
.
y
-
apos0
.
y
,
apos1
.
z
-
apos0
.
z
);
mixed3
eBC
=
make_mixed3
(
apos2
.
x
-
apos1
.
x
,
apos2
.
y
-
apos1
.
y
,
apos2
.
z
-
apos1
.
z
);
mixed3
eBC
=
make_mixed3
(
apos2
.
x
-
apos1
.
x
,
apos2
.
y
-
apos1
.
y
,
apos2
.
z
-
apos1
.
z
);
mixed3
eCA
=
make_mixed3
(
apos0
.
x
-
apos2
.
x
,
apos0
.
y
-
apos2
.
y
,
apos0
.
z
-
apos2
.
z
);
mixed3
eCA
=
make_mixed3
(
apos0
.
x
-
apos2
.
x
,
apos0
.
y
-
apos2
.
y
,
apos0
.
z
-
apos2
.
z
);
eAB
*=
rsqrt
(
eAB
.
x
*
eAB
.
x
+
eAB
.
y
*
eAB
.
y
+
eAB
.
z
*
eAB
.
z
);
eAB
*=
RSQRT
(
eAB
.
x
*
eAB
.
x
+
eAB
.
y
*
eAB
.
y
+
eAB
.
z
*
eAB
.
z
);
eBC
*=
rsqrt
(
eBC
.
x
*
eBC
.
x
+
eBC
.
y
*
eBC
.
y
+
eBC
.
z
*
eBC
.
z
);
eBC
*=
RSQRT
(
eBC
.
x
*
eBC
.
x
+
eBC
.
y
*
eBC
.
y
+
eBC
.
z
*
eBC
.
z
);
eCA
*=
rsqrt
(
eCA
.
x
*
eCA
.
x
+
eCA
.
y
*
eCA
.
y
+
eCA
.
z
*
eCA
.
z
);
eCA
*=
RSQRT
(
eCA
.
x
*
eCA
.
x
+
eCA
.
y
*
eCA
.
y
+
eCA
.
z
*
eCA
.
z
);
mixed
vAB
=
(
v1
.
x
-
v0
.
x
)
*
eAB
.
x
+
(
v1
.
y
-
v0
.
y
)
*
eAB
.
y
+
(
v1
.
z
-
v0
.
z
)
*
eAB
.
z
;
mixed
vAB
=
(
v1
.
x
-
v0
.
x
)
*
eAB
.
x
+
(
v1
.
y
-
v0
.
y
)
*
eAB
.
y
+
(
v1
.
z
-
v0
.
z
)
*
eAB
.
z
;
mixed
vBC
=
(
v2
.
x
-
v1
.
x
)
*
eBC
.
x
+
(
v2
.
y
-
v1
.
y
)
*
eBC
.
y
+
(
v2
.
z
-
v1
.
z
)
*
eBC
.
z
;
mixed
vBC
=
(
v2
.
x
-
v1
.
x
)
*
eBC
.
x
+
(
v2
.
y
-
v1
.
y
)
*
eBC
.
y
+
(
v2
.
z
-
v1
.
z
)
*
eBC
.
z
;
mixed
vCA
=
(
v0
.
x
-
v2
.
x
)
*
eCA
.
x
+
(
v0
.
y
-
v2
.
y
)
*
eCA
.
y
+
(
v0
.
z
-
v2
.
z
)
*
eCA
.
z
;
mixed
vCA
=
(
v0
.
x
-
v2
.
x
)
*
eCA
.
x
+
(
v0
.
y
-
v2
.
y
)
*
eCA
.
y
+
(
v0
.
z
-
v2
.
z
)
*
eCA
.
z
;
...
@@ -574,7 +574,8 @@ extern "C" __global__ void applySettleToVelocities(int numClusters, mixed tol, c
...
@@ -574,7 +574,8 @@ extern "C" __global__ void applySettleToVelocities(int numClusters, mixed tol, c
/**
/**
* Compute the direction each CCMA constraint is pointing in. This is called once at the beginning of constraint evaluation.
* Compute the direction each CCMA constraint is pointing in. This is called once at the beginning of constraint evaluation.
*/
*/
extern
"C"
__global__
void
computeCCMAConstraintDirections
(
const
int2
*
__restrict__
constraintAtoms
,
mixed4
*
__restrict__
constraintDistance
,
const
real4
*
__restrict__
atomPositions
,
const
real4
*
__restrict__
posqCorrection
)
{
extern
"C"
__global__
void
computeCCMAConstraintDirections
(
const
int2
*
__restrict__
constraintAtoms
,
mixed4
*
__restrict__
constraintDistance
,
const
real4
*
__restrict__
atomPositions
,
const
real4
*
__restrict__
posqCorrection
,
int
*
__restrict__
converged
)
{
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_CCMA_CONSTRAINTS
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_CCMA_CONSTRAINTS
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
// Compute the direction for this constraint.
// Compute the direction for this constraint.
...
@@ -587,6 +588,10 @@ extern "C" __global__ void computeCCMAConstraintDirections(const int2* __restric
...
@@ -587,6 +588,10 @@ extern "C" __global__ void computeCCMAConstraintDirections(const int2* __restric
dir
.
z
=
oldPos1
.
z
-
oldPos2
.
z
;
dir
.
z
=
oldPos1
.
z
-
oldPos2
.
z
;
constraintDistance
[
index
]
=
dir
;
constraintDistance
[
index
]
=
dir
;
}
}
if
(
threadIdx
.
x
==
0
&&
blockIdx
.
x
==
0
)
{
converged
[
0
]
=
1
;
converged
[
1
]
=
0
;
}
}
}
/**
/**
...
@@ -605,6 +610,7 @@ extern "C" __global__ void computeCCMAPositionConstraintForce(const int2* __rest
...
@@ -605,6 +610,7 @@ extern "C" __global__ void computeCCMAPositionConstraintForce(const int2* __rest
__syncthreads
();
__syncthreads
();
mixed
lowerTol
=
1
-
2
*
tol
+
tol
*
tol
;
mixed
lowerTol
=
1
-
2
*
tol
+
tol
*
tol
;
mixed
upperTol
=
1
+
2
*
tol
+
tol
*
tol
;
mixed
upperTol
=
1
+
2
*
tol
+
tol
*
tol
;
bool
threadConverged
=
true
;
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_CCMA_CONSTRAINTS
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_CCMA_CONSTRAINTS
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
// Compute the force due to this constraint.
// Compute the force due to this constraint.
...
@@ -620,14 +626,13 @@ extern "C" __global__ void computeCCMAPositionConstraintForce(const int2* __rest
...
@@ -620,14 +626,13 @@ extern "C" __global__ void computeCCMAPositionConstraintForce(const int2* __rest
mixed
dist2
=
dir
.
w
*
dir
.
w
;
mixed
dist2
=
dir
.
w
*
dir
.
w
;
mixed
diff
=
dist2
-
rp2
;
mixed
diff
=
dist2
-
rp2
;
delta1
[
index
]
=
(
rrpr
>
d_ij2
*
1e-6
f
?
reducedMass
[
index
]
*
diff
/
rrpr
:
0.0
f
);
delta1
[
index
]
=
(
rrpr
>
d_ij2
*
1e-6
f
?
reducedMass
[
index
]
*
diff
/
rrpr
:
0.0
f
);
threadConverged
&=
(
rp2
>
lowerTol
*
dist2
&&
rp2
<
upperTol
*
dist2
);
// See whether it has converged.
}
if
(
groupConverged
&&
!
threadConverged
)
if
(
groupConverged
&&
(
rp2
<
lowerTol
*
dist2
||
rp2
>
upperTol
*
dist2
))
{
groupConverged
=
0
;
groupConverged
=
0
;
__syncthreads
();
if
(
threadIdx
.
x
==
0
&&
!
groupConverged
)
converged
[
iteration
%
2
]
=
0
;
converged
[
iteration
%
2
]
=
0
;
}
}
}
}
/**
/**
...
...
Prev
1
2
3
4
5
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment