Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
93c467b2
Commit
93c467b2
authored
Mar 22, 2013
by
Peter Eastman
Browse files
Merged 5.1Optimizations branch back to trunk
parent
f6d4557d
Changes
86
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1724 additions
and
1166 deletions
+1724
-1166
platforms/cuda/src/CudaContext.cpp
platforms/cuda/src/CudaContext.cpp
+2
-1
platforms/cuda/src/CudaContext.h
platforms/cuda/src/CudaContext.h
+2
-0
platforms/cuda/src/CudaIntegrationUtilities.cpp
platforms/cuda/src/CudaIntegrationUtilities.cpp
+14
-16
platforms/cuda/src/CudaIntegrationUtilities.h
platforms/cuda/src/CudaIntegrationUtilities.h
+2
-3
platforms/cuda/src/CudaKernels.cpp
platforms/cuda/src/CudaKernels.cpp
+99
-71
platforms/cuda/src/CudaKernels.h
platforms/cuda/src/CudaKernels.h
+3
-7
platforms/cuda/src/CudaNonbondedUtilities.cpp
platforms/cuda/src/CudaNonbondedUtilities.cpp
+171
-100
platforms/cuda/src/CudaNonbondedUtilities.h
platforms/cuda/src/CudaNonbondedUtilities.h
+35
-23
platforms/cuda/src/CudaParallelKernels.cpp
platforms/cuda/src/CudaParallelKernels.cpp
+15
-17
platforms/cuda/src/CudaParallelKernels.h
platforms/cuda/src/CudaParallelKernels.h
+2
-2
platforms/cuda/src/CudaSort.cpp
platforms/cuda/src/CudaSort.cpp
+42
-32
platforms/cuda/src/CudaSort.h
platforms/cuda/src/CudaSort.h
+3
-2
platforms/cuda/src/kernels/coulombLennardJones.cu
platforms/cuda/src/kernels/coulombLennardJones.cu
+3
-3
platforms/cuda/src/kernels/customGBEnergyN2.cu
platforms/cuda/src/kernels/customGBEnergyN2.cu
+267
-134
platforms/cuda/src/kernels/customGBValueN2.cu
platforms/cuda/src/kernels/customGBValueN2.cu
+223
-168
platforms/cuda/src/kernels/customHbondForce.cu
platforms/cuda/src/kernels/customHbondForce.cu
+2
-2
platforms/cuda/src/kernels/ewald.cu
platforms/cuda/src/kernels/ewald.cu
+6
-6
platforms/cuda/src/kernels/findInteractingBlocks.cu
platforms/cuda/src/kernels/findInteractingBlocks.cu
+261
-143
platforms/cuda/src/kernels/gbsaObc1.cu
platforms/cuda/src/kernels/gbsaObc1.cu
+539
-408
platforms/cuda/src/kernels/integrationUtilities.cu
platforms/cuda/src/kernels/integrationUtilities.cu
+33
-28
No files found.
platforms/cuda/src/CudaContext.cpp
View file @
93c467b2
...
...
@@ -61,7 +61,7 @@ using namespace OpenMM;
using
namespace
std
;
const
int
CudaContext
::
ThreadBlockSize
=
64
;
const
int
CudaContext
::
TileSize
=
32
;
const
int
CudaContext
::
TileSize
=
sizeof
(
tileflags
)
*
8
;
bool
CudaContext
::
hasInitializedCuda
=
false
;
CudaContext
::
CudaContext
(
const
System
&
system
,
int
deviceIndex
,
bool
useBlockingSync
,
const
string
&
precision
,
const
string
&
compiler
,
...
...
@@ -369,6 +369,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
src
<<
"typedef float3 mixed3;
\n
"
;
src
<<
"typedef float4 mixed4;
\n
"
;
}
src
<<
"typedef unsigned int tileflags;
\n
"
;
for
(
map
<
string
,
string
>::
const_iterator
iter
=
defines
.
begin
();
iter
!=
defines
.
end
();
++
iter
)
{
src
<<
"#define "
<<
iter
->
first
;
if
(
!
iter
->
second
.
empty
())
...
...
platforms/cuda/src/CudaContext.h
View file @
93c467b2
...
...
@@ -42,6 +42,8 @@
#include "windowsExportCuda.h"
#include "CudaPlatform.h"
typedef
unsigned
int
tileflags
;
namespace
OpenMM
{
class
CudaArray
;
...
...
platforms/cuda/src/CudaIntegrationUtilities.cpp
View file @
93c467b2
...
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. *
* Portions copyright (c) 2009-2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -99,7 +99,7 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
posDelta
(
NULL
),
settleAtoms
(
NULL
),
settleParams
(
NULL
),
shakeAtoms
(
NULL
),
shakeParams
(
NULL
),
random
(
NULL
),
randomSeed
(
NULL
),
randomPos
(
0
),
stepSize
(
NULL
),
ccmaAtoms
(
NULL
),
ccmaDistance
(
NULL
),
ccmaReducedMass
(
NULL
),
ccmaAtomConstraints
(
NULL
),
ccmaNumAtomConstraints
(
NULL
),
ccmaConstraintMatrixColumn
(
NULL
),
ccmaConstraintMatrixValue
(
NULL
),
ccmaDelta1
(
NULL
),
ccmaDelta2
(
NULL
),
ccmaConverged
Memory
(
NULL
),
ccmaConstraintMatrixValue
(
NULL
),
ccmaDelta1
(
NULL
),
ccmaDelta2
(
NULL
),
ccmaConverged
(
NULL
),
vsite2AvgAtoms
(
NULL
),
vsite2AvgWeights
(
NULL
),
vsite3AvgAtoms
(
NULL
),
vsite3AvgWeights
(
NULL
),
vsiteOutOfPlaneAtoms
(
NULL
),
vsiteOutOfPlaneWeights
(
NULL
)
{
// Create workspace arrays.
...
...
@@ -466,9 +466,8 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
ccmaAtoms
=
CudaArray
::
create
<
int2
>
(
context
,
numCCMA
,
"CcmaAtoms"
);
ccmaAtomConstraints
=
CudaArray
::
create
<
int
>
(
context
,
numAtoms
*
maxAtomConstraints
,
"CcmaAtomConstraints"
);
ccmaNumAtomConstraints
=
CudaArray
::
create
<
int
>
(
context
,
numAtoms
,
"CcmaAtomConstraintsIndex"
);
CHECK_RESULT2
(
cuMemHostAlloc
((
void
**
)
&
ccmaConvergedMemory
,
2
*
sizeof
(
int
),
CU_MEMHOSTALLOC_DEVICEMAP
),
"Error allocating pinned memory"
);
CHECK_RESULT2
(
cuMemHostGetDevicePointer
(
&
ccmaConvergedDeviceMemory
,
ccmaConvergedMemory
,
0
),
"Error getting device address for pinned memory"
);
ccmaConstraintMatrixColumn
=
CudaArray
::
create
<
int
>
(
context
,
numCCMA
*
maxRowElements
,
"ConstraintMatrixColumn"
);
ccmaConverged
=
CudaArray
::
create
<
int
>
(
context
,
2
,
"ccmaConverged"
);
vector
<
int2
>
atomsVec
(
ccmaAtoms
->
getSize
());
vector
<
int
>
atomConstraintsVec
(
ccmaAtomConstraints
->
getSize
());
vector
<
int
>
numAtomConstraintsVec
(
ccmaNumAtomConstraints
->
getSize
());
...
...
@@ -680,8 +679,8 @@ CudaIntegrationUtilities::~CudaIntegrationUtilities() {
delete
ccmaDelta1
;
if
(
ccmaDelta2
!=
NULL
)
delete
ccmaDelta2
;
if
(
ccmaConverged
Memory
!=
NULL
)
cuMemFreeHost
(
ccmaConverged
Memory
)
;
if
(
ccmaConverged
!=
NULL
)
delete
ccmaConverged
;
if
(
vsite2AvgAtoms
!=
NULL
)
delete
vsite2AvgAtoms
;
if
(
vsite2AvgWeights
!=
NULL
)
...
...
@@ -734,33 +733,32 @@ void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double
context
.
executeKernel
(
shakeKernel
,
args
,
shakeAtoms
->
getSize
());
}
if
(
ccmaAtoms
!=
NULL
)
{
void
*
directionsArgs
[]
=
{
&
ccmaAtoms
->
getDevicePointer
(),
&
ccmaDistance
->
getDevicePointer
(),
&
context
.
getPosq
().
getDevicePointer
(),
&
posCorrection
};
void
*
directionsArgs
[]
=
{
&
ccmaAtoms
->
getDevicePointer
(),
&
ccmaDistance
->
getDevicePointer
(),
&
context
.
getPosq
().
getDevicePointer
(),
&
posCorrection
,
&
ccmaConverged
->
getDevicePointer
()
};
context
.
executeKernel
(
ccmaDirectionsKernel
,
directionsArgs
,
ccmaAtoms
->
getSize
());
int
i
;
void
*
forceArgs
[]
=
{
&
ccmaAtoms
->
getDevicePointer
(),
&
ccmaDistance
->
getDevicePointer
(),
constrainVelocities
?
&
context
.
getVelm
().
getDevicePointer
()
:
&
posDelta
->
getDevicePointer
(),
&
ccmaReducedMass
->
getDevicePointer
(),
&
ccmaDelta1
->
getDevicePointer
(),
&
ccmaConvergedDevice
Memory
,
&
ccmaReducedMass
->
getDevicePointer
(),
&
ccmaDelta1
->
getDevicePointer
(),
&
ccmaConverged
->
get
Device
Pointer
()
,
tolPointer
,
&
i
};
void
*
multiplyArgs
[]
=
{
&
ccmaDelta1
->
getDevicePointer
(),
&
ccmaDelta2
->
getDevicePointer
(),
&
ccmaConstraintMatrixColumn
->
getDevicePointer
(),
&
ccmaConstraintMatrixValue
->
getDevicePointer
(),
&
ccmaConvergedDevice
Memory
,
&
i
};
&
ccmaConstraintMatrixColumn
->
getDevicePointer
(),
&
ccmaConstraintMatrixValue
->
getDevicePointer
(),
&
ccmaConverged
->
get
Device
Pointer
()
,
&
i
};
void
*
updateArgs
[]
=
{
&
ccmaNumAtomConstraints
->
getDevicePointer
(),
&
ccmaAtomConstraints
->
getDevicePointer
(),
&
ccmaDistance
->
getDevicePointer
(),
constrainVelocities
?
&
context
.
getVelm
().
getDevicePointer
()
:
&
posDelta
->
getDevicePointer
(),
&
context
.
getVelm
().
getDevicePointer
(),
&
ccmaDelta1
->
getDevicePointer
(),
&
ccmaDelta2
->
getDevicePointer
(),
&
ccmaConvergedDevice
Memory
,
&
i
};
&
ccmaConverged
->
get
Device
Pointer
()
,
&
i
};
const
int
checkInterval
=
4
;
int
*
converged
=
(
int
*
)
context
.
getPinnedBuffer
();
for
(
i
=
0
;
i
<
150
;
i
++
)
{
if
(
i
==
0
)
{
ccmaConvergedMemory
[
0
]
=
1
;
ccmaConvergedMemory
[
1
]
=
0
;
}
context
.
executeKernel
(
ccmaForceKernel
,
forceArgs
,
ccmaAtoms
->
getSize
());
if
((
i
+
1
)
%
checkInterval
==
0
)
if
((
i
+
1
)
%
checkInterval
==
0
)
{
ccmaConverged
->
download
(
converged
,
false
);
CHECK_RESULT2
(
cuEventRecord
(
ccmaEvent
,
0
),
"Error recording event for CCMA"
);
}
context
.
executeKernel
(
ccmaMultiplyKernel
,
multiplyArgs
,
ccmaAtoms
->
getSize
());
context
.
executeKernel
(
ccmaUpdateKernel
,
updateArgs
,
context
.
getNumAtoms
());
if
((
i
+
1
)
%
checkInterval
==
0
)
{
CHECK_RESULT2
(
cuEventSynchronize
(
ccmaEvent
),
"Error synchronizing on event for CCMA"
);
if
(
c
cmaC
onverged
Memory
[
i
%
2
])
if
(
converged
[
i
%
2
])
break
;
}
}
...
...
platforms/cuda/src/CudaIntegrationUtilities.h
View file @
93c467b2
...
...
@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. *
* Portions copyright (c) 2009-2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -140,8 +140,7 @@ private:
CudaArray
*
ccmaConstraintMatrixValue
;
CudaArray
*
ccmaDelta1
;
CudaArray
*
ccmaDelta2
;
int
*
ccmaConvergedMemory
;
CUdeviceptr
ccmaConvergedDeviceMemory
;
CudaArray
*
ccmaConverged
;
CUevent
ccmaEvent
;
CudaArray
*
vsite2AvgAtoms
;
CudaArray
*
vsite2AvgWeights
;
...
...
platforms/cuda/src/CudaKernels.cpp
View file @
93c467b2
This diff is collapsed.
Click to expand it.
platforms/cuda/src/CudaKernels.h
View file @
93c467b2
...
...
@@ -557,8 +557,7 @@ class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
public:
CudaCalcNonbondedForceKernel
(
std
::
string
name
,
const
Platform
&
platform
,
CudaContext
&
cu
,
System
&
system
)
:
CalcNonbondedForceKernel
(
name
,
platform
),
cu
(
cu
),
hasInitializedFFT
(
false
),
sigmaEpsilon
(
NULL
),
exceptionParams
(
NULL
),
cosSinSums
(
NULL
),
directPmeGrid
(
NULL
),
reciprocalPmeGrid
(
NULL
),
pmeBsplineModuliX
(
NULL
),
pmeBsplineModuliY
(
NULL
),
pmeBsplineModuliZ
(
NULL
),
pmeBsplineTheta
(
NULL
),
pmeBsplineDTheta
(
NULL
),
pmeAtomRange
(
NULL
),
pmeAtomGridIndex
(
NULL
),
sort
(
NULL
)
{
pmeBsplineModuliX
(
NULL
),
pmeBsplineModuliY
(
NULL
),
pmeBsplineModuliZ
(
NULL
),
pmeAtomRange
(
NULL
),
pmeAtomGridIndex
(
NULL
),
sort
(
NULL
)
{
}
~
CudaCalcNonbondedForceKernel
();
/**
...
...
@@ -607,8 +606,6 @@ private:
CudaArray
*
pmeBsplineModuliX
;
CudaArray
*
pmeBsplineModuliY
;
CudaArray
*
pmeBsplineModuliZ
;
CudaArray
*
pmeBsplineTheta
;
CudaArray
*
pmeBsplineDTheta
;
CudaArray
*
pmeAtomRange
;
CudaArray
*
pmeAtomGridIndex
;
CudaSort
*
sort
;
...
...
@@ -617,9 +614,6 @@ private:
CUfunction
ewaldSumsKernel
;
CUfunction
ewaldForcesKernel
;
CUfunction
pmeGridIndexKernel
;
CUfunction
pmeAtomRangeKernel
;
CUfunction
pmeZIndexKernel
;
CUfunction
pmeUpdateBsplinesKernel
;
CUfunction
pmeSpreadChargeKernel
;
CUfunction
pmeFinishSpreadChargeKernel
;
CUfunction
pmeEvalEnergyKernel
;
...
...
@@ -776,6 +770,8 @@ private:
System
&
system
;
CUfunction
pairValueKernel
,
perParticleValueKernel
,
pairEnergyKernel
,
perParticleEnergyKernel
,
gradientChainRuleKernel
;
std
::
vector
<
void
*>
pairValueArgs
,
perParticleValueArgs
,
pairEnergyArgs
,
perParticleEnergyArgs
,
gradientChainRuleArgs
;
std
::
string
pairValueSrc
,
pairEnergySrc
;
std
::
map
<
std
::
string
,
std
::
string
>
pairValueDefines
,
pairEnergyDefines
;
};
/**
...
...
platforms/cuda/src/CudaNonbondedUtilities.cpp
View file @
93c467b2
This diff is collapsed.
Click to expand it.
platforms/cuda/src/CudaNonbondedUtilities.h
View file @
93c467b2
...
...
@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. *
* Portions copyright (c) 2009-2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -36,6 +36,8 @@
namespace
OpenMM
{
class
CudaSort
;
/**
* This class provides a generic interface for calculating nonbonded interactions. It does this in two
* ways. First, it can be used to create kernels that evaluate nonbonded interactions. Clients
...
...
@@ -181,10 +183,10 @@ public:
return
*
interactingTiles
;
}
/**
* Get the array containing flags for tiles with interactions.
* Get the array containing the atoms in each tile with interactions.
*/
CudaArray
&
getInteracti
onFlag
s
()
{
return
*
interacti
onFlag
s
;
CudaArray
&
getInteracti
ngAtom
s
()
{
return
*
interacti
ngAtom
s
;
}
/**
* Get the array containing exclusion flags.
...
...
@@ -192,6 +194,12 @@ public:
CudaArray
&
getExclusions
()
{
return
*
exclusions
;
}
/**
* Get the array containing tiles with exclusions.
*/
CudaArray
&
getExclusionTiles
()
{
return
*
exclusionTiles
;
}
/**
* Get the array containing the index into the exclusion array for each tile.
*/
...
...
@@ -217,9 +225,17 @@ public:
return
numTiles
;
}
/**
* Set the range of tiles that should be processed by this context.
* Set whether to add padding to the cutoff distance when building the neighbor list.
* This increases the size of the neighbor list (and thus the cost of computing interactions),
* but also means we don't need to rebuild it every time step. The default value is true,
* since usually this improves performance. For very expensive interactions, however,
* it may be better to set this to false.
*/
void
setUsePadding
(
bool
padding
);
/**
* Set the range of atom blocks and tiles that should be processed by this context.
*/
void
set
TileRange
(
int
startTileIndex
,
int
numTiles
);
void
set
AtomBlockRange
(
double
startFraction
,
double
endFraction
);
/**
* Create a Kernel for evaluating a nonbonded interaction. Cutoffs and periodic boundary conditions
* are assumed to be the same as those for the default interaction Kernel, since this kernel will use
...
...
@@ -232,42 +248,38 @@ public:
* @param isSymmetric specifies whether the interaction is symmetric
*/
CUfunction
createInteractionKernel
(
const
std
::
string
&
source
,
std
::
vector
<
ParameterInfo
>&
params
,
std
::
vector
<
ParameterInfo
>&
arguments
,
bool
useExclusions
,
bool
isSymmetric
);
/**
* This is a utility routine for locating data in the exclusions array. It takes the (x,y) indices of a tile,
* and returns the location in the array where the data for that tile begins.
*
* This routine requires that x >= y. If not, it will throw an exception.
*
* @param x the x index of the tile
* @param y the y index of the tile
* @param exclusionIndices the content of the exclusionIndices array
* @param exclusionRowIndices the content of the exclusionRowIndices array
* @return the index in the exclusions array at which the data for that tile begins
*/
static
int
findExclusionIndex
(
int
x
,
int
y
,
const
std
::
vector
<
unsigned
int
>&
exclusionIndices
,
const
std
::
vector
<
unsigned
int
>&
exclusionRowIndices
);
private:
class
BlockSortTrait
;
CudaContext
&
context
;
CUfunction
forceKernel
;
CUfunction
findBlockBoundsKernel
;
CUfunction
sortBoxDataKernel
;
CUfunction
findInteractingBlocksKernel
;
CUfunction
findInteractionsWithinBlocksKernel
;
CudaArray
*
exclusionTiles
;
CudaArray
*
exclusions
;
CudaArray
*
exclusionIndices
;
CudaArray
*
exclusionRowIndices
;
CudaArray
*
interactingTiles
;
CudaArray
*
interacti
onFlag
s
;
CudaArray
*
interacti
ngAtom
s
;
CudaArray
*
interactionCount
;
CudaArray
*
blockCenter
;
CudaArray
*
blockBoundingBox
;
std
::
vector
<
void
*>
forceArgs
,
findBlockBoundsArgs
,
findInteractingBlocksArgs
,
findInteractionsWithinBlocksArgs
;
CudaArray
*
sortedBlocks
;
CudaArray
*
sortedBlockCenter
;
CudaArray
*
sortedBlockBoundingBox
;
CudaArray
*
oldPositions
;
CudaArray
*
rebuildNeighborList
;
CudaSort
*
blockSorter
;
std
::
vector
<
void
*>
forceArgs
,
findBlockBoundsArgs
,
sortBoxDataArgs
,
findInteractingBlocksArgs
;
std
::
vector
<
std
::
vector
<
int
>
>
atomExclusions
;
std
::
vector
<
ParameterInfo
>
parameters
;
std
::
vector
<
ParameterInfo
>
arguments
;
std
::
string
kernelSource
;
std
::
map
<
std
::
string
,
std
::
string
>
kernelDefines
;
double
cutoff
;
bool
useCutoff
,
usePeriodic
,
anyExclusions
;
int
startTileIndex
,
numTiles
,
maxTiles
,
numForceThreadBlocks
,
forceThreadBlockSize
,
nonbondedForceGroup
,
numAtoms
;
bool
useCutoff
,
usePeriodic
,
anyExclusions
,
usePadding
;
int
startTileIndex
,
numTiles
,
startBlockIndex
,
numBlocks
,
maxTiles
,
numForceThreadBlocks
,
forceThreadBlockSize
,
nonbondedForceGroup
,
numAtoms
;
};
/**
...
...
platforms/cuda/src/CudaParallelKernels.cpp
View file @
93c467b2
...
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011-2012 Stanford University and the Authors. *
* Portions copyright (c) 2011-2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -118,7 +118,7 @@ private:
};
CudaParallelCalcForcesAndEnergyKernel
::
CudaParallelCalcForcesAndEnergyKernel
(
string
name
,
const
Platform
&
platform
,
CudaPlatform
::
PlatformData
&
data
)
:
CalcForcesAndEnergyKernel
(
name
,
platform
),
data
(
data
),
completionTimes
(
data
.
contexts
.
size
()),
context
Tile
s
(
data
.
contexts
.
size
()),
contextForces
(
NULL
),
CalcForcesAndEnergyKernel
(
name
,
platform
),
data
(
data
),
completionTimes
(
data
.
contexts
.
size
()),
context
NonbondedFraction
s
(
data
.
contexts
.
size
()),
contextForces
(
NULL
),
pinnedPositionBuffer
(
NULL
),
pinnedForceBuffer
(
NULL
)
{
for
(
int
i
=
0
;
i
<
(
int
)
data
.
contexts
.
size
();
i
++
)
kernels
.
push_back
(
Kernel
(
new
CudaCalcForcesAndEnergyKernel
(
name
,
platform
,
*
data
.
contexts
[
i
])));
...
...
@@ -141,6 +141,8 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
sumKernel
=
cu
.
getKernel
(
module
,
"sumForces"
);
for
(
int
i
=
0
;
i
<
(
int
)
kernels
.
size
();
i
++
)
getKernel
(
i
).
initialize
(
system
);
for
(
int
i
=
0
;
i
<
(
int
)
contextNonbondedFractions
.
size
();
i
++
)
contextNonbondedFractions
[
i
]
=
1
/
(
double
)
contextNonbondedFractions
.
size
();
}
void
CudaParallelCalcForcesAndEnergyKernel
::
beginComputation
(
ContextImpl
&
context
,
bool
includeForce
,
bool
includeEnergy
,
int
groups
)
{
...
...
@@ -184,30 +186,26 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
void
*
args
[]
=
{
&
cu
.
getForce
().
getDevicePointer
(),
&
contextForces
->
getDevicePointer
(),
&
bufferSize
,
&
numBuffers
};
cu
.
executeKernel
(
sumKernel
,
args
,
bufferSize
);
// Balance work between the contexts by transferring a few nonbonded tiles from the context that
// Balance work between the contexts by transferring a little nonbonded work from the context that
// finished last to the one that finished first.
int
firstIndex
=
0
,
lastIndex
=
0
;
int
totalTiles
=
0
;
for
(
int
i
=
0
;
i
<
(
int
)
completionTimes
.
size
();
i
++
)
{
if
(
completionTimes
[
i
]
<
completionTimes
[
firstIndex
])
firstIndex
=
i
;
if
(
completionTimes
[
i
]
>
completionTimes
[
lastIndex
])
lastIndex
=
i
;
contextTiles
[
i
]
=
data
.
contexts
[
i
]
->
getNonbondedUtilities
().
getNumTiles
();
totalTiles
+=
contextTiles
[
i
];
}
int
tilesToTransfer
=
totalTiles
/
1000
;
if
(
tilesToTransfer
<
1
)
tilesToTransfer
=
1
;
if
(
tilesToTransfer
>
contextTiles
[
lastIndex
])
tilesToTransfer
=
contextTiles
[
lastIndex
];
contextTiles
[
firstIndex
]
+=
tilesToTransfer
;
contextTiles
[
lastIndex
]
-=
tilesToTransfer
;
int
startIndex
=
0
;
for
(
int
i
=
0
;
i
<
(
int
)
contextTiles
.
size
();
i
++
)
{
data
.
contexts
[
i
]
->
getNonbondedUtilities
().
setTileRange
(
startIndex
,
contextTiles
[
i
]);
startIndex
+=
contextTiles
[
i
];
}
double
fractionToTransfer
=
min
(
0.001
,
contextNonbondedFractions
[
lastIndex
]);
contextNonbondedFractions
[
firstIndex
]
+=
fractionToTransfer
;
contextNonbondedFractions
[
lastIndex
]
-=
fractionToTransfer
;
double
startFraction
=
0.0
;
for
(
int
i
=
0
;
i
<
(
int
)
contextNonbondedFractions
.
size
();
i
++
)
{
double
endFraction
=
startFraction
+
contextNonbondedFractions
[
i
];
if
(
i
==
contextNonbondedFractions
.
size
()
-
1
)
endFraction
=
1.0
;
// Avoid roundoff error
data
.
contexts
[
i
]
->
getNonbondedUtilities
().
setAtomBlockRange
(
startFraction
,
endFraction
);
startFraction
=
endFraction
;
}
}
return
energy
;
...
...
platforms/cuda/src/CudaParallelKernels.h
View file @
93c467b2
...
...
@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011-2012 Stanford University and the Authors. *
* Portions copyright (c) 2011-2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -80,7 +80,7 @@ private:
CudaPlatform
::
PlatformData
&
data
;
std
::
vector
<
Kernel
>
kernels
;
std
::
vector
<
long
long
>
completionTimes
;
std
::
vector
<
int
>
context
Tile
s
;
std
::
vector
<
double
>
context
NonbondedFraction
s
;
CudaArray
*
contextForces
;
void
*
pinnedPositionBuffer
;
long
long
*
pinnedForceBuffer
;
...
...
platforms/cuda/src/CudaSort.cpp
View file @
93c467b2
...
...
@@ -32,7 +32,7 @@ using namespace OpenMM;
using
namespace
std
;
CudaSort
::
CudaSort
(
CudaContext
&
context
,
SortTrait
*
trait
,
unsigned
int
length
)
:
context
(
context
),
trait
(
trait
),
dataRange
(
NULL
),
bucketOfElement
(
NULL
),
offsetInBucket
(
NULL
),
bucketOffset
(
NULL
),
buckets
(
NULL
)
{
dataRange
(
NULL
),
bucketOfElement
(
NULL
),
offsetInBucket
(
NULL
),
bucketOffset
(
NULL
),
buckets
(
NULL
)
,
dataLength
(
length
)
{
// Create kernels.
map
<
string
,
string
>
replacements
;
...
...
@@ -43,6 +43,7 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
replacements
[
"MAX_KEY"
]
=
trait
->
getMaxKey
();
replacements
[
"MAX_VALUE"
]
=
trait
->
getMaxValue
();
CUmodule
module
=
context
.
createModule
(
context
.
replaceStrings
(
CudaKernelSources
::
sort
,
replacements
));
shortListKernel
=
context
.
getKernel
(
module
,
"sortShortList"
);
computeRangeKernel
=
context
.
getKernel
(
module
,
"computeRange"
);
assignElementsKernel
=
context
.
getKernel
(
module
,
"assignElementsToBuckets"
);
computeBucketPositionsKernel
=
context
.
getKernel
(
module
,
"computeBucketPositions"
);
...
...
@@ -53,15 +54,16 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
int
maxBlockSize
;
cuDeviceGetAttribute
(
&
maxBlockSize
,
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X
,
context
.
getDevice
());
int
maxSharedMem
;
cuDeviceGetAttribute
(
&
maxSharedMem
,
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK
,
context
.
getDevice
());
unsigned
int
maxLocalBuffer
=
(
unsigned
int
)
((
maxSharedMem
/
trait
->
getDataSize
())
/
2
);
isShortList
=
(
length
<=
maxLocalBuffer
);
for
(
rangeKernelSize
=
1
;
rangeKernelSize
*
2
<=
maxBlockSize
;
rangeKernelSize
*=
2
)
;
positionsKernelSize
=
rangeKernelSize
;
sortKernelSize
=
rangeKernelSize
/
2
;
sortKernelSize
=
(
isShortList
?
rangeKernelSize
/
2
:
rangeKernelSize
/
4
)
;
if
(
rangeKernelSize
>
length
)
rangeKernelSize
=
length
;
int
maxSharedMem
;
cuDeviceGetAttribute
(
&
maxSharedMem
,
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK
,
context
.
getDevice
());
unsigned
int
maxLocalBuffer
=
(
unsigned
int
)
((
maxSharedMem
/
trait
->
getDataSize
())
/
2
);
if
(
sortKernelSize
>
maxLocalBuffer
)
sortKernelSize
=
maxLocalBuffer
;
unsigned
int
targetBucketSize
=
sortKernelSize
/
2
;
...
...
@@ -73,11 +75,13 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
// Create workspace arrays.
if
(
!
isShortList
)
{
dataRange
=
new
CudaArray
(
context
,
2
,
trait
->
getKeySize
(),
"sortDataRange"
);
bucketOffset
=
CudaArray
::
create
<
uint1
>
(
context
,
numBuckets
,
"bucketOffset"
);
bucketOfElement
=
CudaArray
::
create
<
uint1
>
(
context
,
length
,
"bucketOfElement"
);
offsetInBucket
=
CudaArray
::
create
<
uint1
>
(
context
,
length
,
"offsetInBucket"
);
buckets
=
new
CudaArray
(
context
,
length
,
trait
->
getDataSize
(),
"buckets"
);
}
}
CudaSort
::~
CudaSort
()
{
...
...
@@ -95,22 +99,27 @@ CudaSort::~CudaSort() {
}
void
CudaSort
::
sort
(
CudaArray
&
data
)
{
if
(
data
.
getSize
()
!=
bucketOfElement
->
getSize
()
||
data
.
getElementSize
()
!=
trait
->
getDataSize
())
if
(
data
.
getSize
()
!=
dataLength
||
data
.
getElementSize
()
!=
trait
->
getDataSize
())
throw
OpenMMException
(
"CudaSort called with different data size"
);
if
(
data
.
getSize
()
==
0
)
return
;
if
(
isShortList
)
{
// We can use a simpler sort kernel that does the entire operation at once in local memory.
void
*
sortArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
dataLength
};
context
.
executeKernel
(
shortListKernel
,
sortArgs
,
sortKernelSize
,
sortKernelSize
,
dataLength
*
trait
->
getDataSize
());
}
else
{
// Compute the range of data values.
unsigned
int
dataSize
=
data
.
getSize
();
void
*
rangeArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
dataSize
,
&
dataRange
->
getDevicePointer
()};
void
*
rangeArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
dataLength
,
&
dataRange
->
getDevicePointer
()};
context
.
executeKernel
(
computeRangeKernel
,
rangeArgs
,
rangeKernelSize
,
rangeKernelSize
,
rangeKernelSize
*
trait
->
getKeySize
());
// Assign array elements to buckets.
unsigned
int
numBuckets
=
bucketOffset
->
getSize
();
context
.
clearBuffer
(
*
bucketOffset
);
void
*
elementsArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
data
Size
,
&
numBuckets
,
&
dataRange
->
getDevicePointer
(),
void
*
elementsArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
data
Length
,
&
numBuckets
,
&
dataRange
->
getDevicePointer
(),
&
bucketOffset
->
getDevicePointer
(),
&
bucketOfElement
->
getDevicePointer
(),
&
offsetInBucket
->
getDevicePointer
()};
context
.
executeKernel
(
assignElementsKernel
,
elementsArgs
,
data
.
getSize
());
...
...
@@ -121,7 +130,7 @@ void CudaSort::sort(CudaArray& data) {
// Copy the data into the buckets.
void
*
copyArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
buckets
->
getDevicePointer
(),
&
data
Size
,
&
bucketOffset
->
getDevicePointer
(),
void
*
copyArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
buckets
->
getDevicePointer
(),
&
data
Length
,
&
bucketOffset
->
getDevicePointer
(),
&
bucketOfElement
->
getDevicePointer
(),
&
offsetInBucket
->
getDevicePointer
()};
context
.
executeKernel
(
copyToBucketsKernel
,
copyArgs
,
data
.
getSize
());
...
...
@@ -129,4 +138,5 @@ void CudaSort::sort(CudaArray& data) {
void
*
sortArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
buckets
->
getDevicePointer
(),
&
numBuckets
,
&
bucketOffset
->
getDevicePointer
()};
context
.
executeKernel
(
sortBucketsKernel
,
sortArgs
,
((
data
.
getSize
()
+
sortKernelSize
-
1
)
/
sortKernelSize
)
*
sortKernelSize
,
sortKernelSize
,
sortKernelSize
*
trait
->
getDataSize
());
}
}
platforms/cuda/src/CudaSort.h
View file @
93c467b2
...
...
@@ -92,8 +92,9 @@ private:
CudaArray
*
offsetInBucket
;
CudaArray
*
bucketOffset
;
CudaArray
*
buckets
;
CUfunction
computeRangeKernel
,
assignElementsKernel
,
computeBucketPositionsKernel
,
copyToBucketsKernel
,
sortBucketsKernel
;
unsigned
int
rangeKernelSize
,
positionsKernelSize
,
sortKernelSize
;
CUfunction
shortListKernel
,
computeRangeKernel
,
assignElementsKernel
,
computeBucketPositionsKernel
,
copyToBucketsKernel
,
sortBucketsKernel
;
unsigned
int
dataLength
,
rangeKernelSize
,
positionsKernelSize
,
sortKernelSize
;
bool
isShortList
;
};
/**
...
...
platforms/cuda/src/kernels/coulombLennardJones.cu
View file @
93c467b2
#if USE_EWALD
bool
needCorrection
=
isExcluded
&&
atom1
!=
atom2
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
;
bool
needCorrection
=
hasExclusions
&&
isExcluded
&&
atom1
!=
atom2
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
;
if
(
!
isExcluded
||
needCorrection
)
{
real
tempForce
=
0.0
f
;
if
(
r2
<
CUTOFF_SQUARED
||
needCorrection
)
{
const
real
alphaR
=
EWALD_ALPHA
*
r
;
const
real
expAlphaRSqr
=
EXP
(
-
alphaR
*
alphaR
);
...
...
@@ -16,6 +15,7 @@ if (!isExcluded || needCorrection) {
t
*=
t
;
t
*=
t
;
const
real
erfcAlphaR
=
RECIP
(
t
*
t
);
real
tempForce
=
0.0
f
;
if
(
needCorrection
)
{
// Subtract off the part of this interaction that was included in the reciprocal space contribution.
...
...
@@ -36,8 +36,8 @@ if (!isExcluded || needCorrection) {
tempEnergy
+=
prefactor
*
erfcAlphaR
;
#endif
}
}
dEdR
+=
tempForce
*
invR
*
invR
;
}
}
#else
{
...
...
platforms/cuda/src/kernels/customGBEnergyN2.cu
View file @
93c467b2
#define STORE_DERIVATIVE_1(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (deriv##INDEX##_1*0x100000000)));
#define STORE_DERIVATIVE_2(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].deriv##INDEX*0x100000000)));
#define TILE_SIZE 32
typedef
struct
{
real4
posq
;
...
...
@@ -15,88 +14,43 @@ typedef struct {
* Compute a force based on pair interactions.
*/
extern
"C"
__global__
void
computeN2Energy
(
unsigned
long
long
*
__restrict__
forceBuffers
,
real
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
unsigned
int
*
__restrict__
exclusions
,
const
unsigned
int
*
__restrict__
exclusionIndices
,
const
unsigned
int
*
__restrict__
exclusionRowIndices
,
const
real4
*
__restrict__
posq
,
const
unsigned
int
*
__restrict__
exclusions
,
const
ushort2
*
__restrict__
exclusionTiles
,
#ifdef USE_CUTOFF
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
unsigned
int
*
__restrict__
interacti
onFlag
s
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
const
unsigned
int
*
__restrict__
interacti
ngAtom
s
#else
unsigned
int
numTiles
#endif
PARAMETER_ARGUMENTS
)
{
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
#ifdef USE_CUTOFF
unsigned
int
numTiles
=
interactionCount
[
0
];
unsigned
int
pos
=
warp
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
unsigned
int
end
=
(
warp
+
1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
#else
unsigned
int
pos
=
warp
*
numTiles
/
totalWarps
;
unsigned
int
end
=
(
warp
+
1
)
*
numTiles
/
totalWarps
;
#endif
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
const
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
real
energy
=
0
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
__shared__
unsigned
int
exclusionRange
[
2
*
WARPS_PER_GROUP
];
__shared__
int
exclusionIndex
[
WARPS_PER_GROUP
];
do
{
// Extract the coordinates of this tile
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
const
unsigned
int
localGroupIndex
=
threadIdx
.
x
/
TILE_SIZE
;
unsigned
int
x
,
y
;
// First loop: process tiles that contain exclusions.
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE
+
warp
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE
+
(
warp
+
1
)
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
for
(
int
pos
=
firstExclusionTile
;
pos
<
lastExclusionTile
;
pos
++
)
{
const
ushort2
tileIndices
=
exclusionTiles
[
pos
];
const
unsigned
int
x
=
tileIndices
.
x
;
const
unsigned
int
y
=
tileIndices
.
y
;
real3
force
=
make_real3
(
0
);
DECLARE_ATOM1_DERIVATIVES
if
(
pos
<
end
)
{
#ifdef USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles
[
pos
];
x
=
tileIndices
.
x
;
y
=
tileIndices
.
y
;
}
else
#endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}
}
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real4
posq1
=
posq
[
atom1
];
LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS
if
(
tgx
<
2
)
exclusionRange
[
2
*
localGroupIndex
+
tgx
]
=
exclusionRowIndices
[
x
+
tgx
];
if
(
tgx
==
0
)
exclusionIndex
[
localGroupIndex
]
=
-
1
;
for
(
unsigned
int
i
=
exclusionRange
[
2
*
localGroupIndex
]
+
tgx
;
i
<
exclusionRange
[
2
*
localGroupIndex
+
1
];
i
+=
TILE_SIZE
)
if
(
exclusionIndices
[
i
]
==
y
)
exclusionIndex
[
localGroupIndex
]
=
i
*
TILE_SIZE
;
bool
hasExclusions
=
(
exclusionIndex
[
localGroupIndex
]
>
-
1
);
#else
bool
hasExclusions
=
false
;
unsigned
int
excl
=
exclusions
[
pos
*
TILE_SIZE
+
tgx
];
#endif
if
(
pos
>=
end
)
;
// This warp is done.
else
if
(
x
==
y
)
{
if
(
x
==
y
)
{
// This tile is on the diagonal.
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
localData
[
localAtomIndex
].
posq
=
posq1
;
LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS
unsigned
int
excl
=
exclusions
[
exclusionIndex
[
localGroupIndex
]
+
tgx
];
#endif
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
#ifdef USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
);
#endif
int
atom2
=
tbx
+
j
;
real4
posq2
=
localData
[
atom2
].
posq
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
...
...
@@ -115,6 +69,9 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
atom2
=
y
*
TILE_SIZE
+
j
;
real
dEdR
=
0
;
real
tempEnergy
=
0
;
#ifdef USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
);
#endif
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
atom1
!=
atom2
)
{
COMPUTE_INTERACTION
dEdR
/=
-
r
;
...
...
@@ -136,32 +93,16 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
// This is an off-diagonal tile.
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
if
(
lasty
!=
y
)
{
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
localData
[
localAtomIndex
].
posq
=
posq
[
j
];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
localData
[
localAtomIndex
].
force
=
make_real3
(
0
);
CLEAR_LOCAL_DERIVATIVES
#ifdef USE_CUTOFF
unsigned
int
flags
=
(
numTiles
<=
maxTiles
?
interactionFlags
[
pos
]
:
0xFFFFFFFF
);
if
(
!
hasExclusions
&&
flags
==
0
)
{
// No interactions in this tile.
}
else
#endif
{
// Compute the full set of interactions in this tile.
#ifdef USE_EXCLUSIONS
unsigned
int
excl
=
(
hasExclusions
?
exclusions
[
exclusionIndex
[
localGroupIndex
]
+
tgx
]
:
0xFFFFFFFF
);
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
TILE_SIZE
-
tgx
));
#endif
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
#ifdef USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
);
#endif
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
real4
posq2
=
localData
[
atom2
].
posq
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
...
...
@@ -180,6 +121,9 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
atom2
=
y
*
TILE_SIZE
+
tj
;
real
dEdR
=
0
;
real
tempEnergy
=
0
;
#ifdef USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
);
#endif
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
COMPUTE_INTERACTION
dEdR
/=
-
r
;
...
...
@@ -203,27 +147,216 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
}
}
lasty
=
y
;
// Write results.
if
(
pos
<
end
)
{
const
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
z
*
0x100000000
)));
STORE_DERIVATIVES_1
}
if
(
pos
<
end
&&
x
!=
y
)
{
const
unsigned
int
offset
=
y
*
TILE_SIZE
+
tgx
;
if
(
x
!=
y
)
{
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
STORE_DERIVATIVES_2
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
unsigned
int
numTiles
=
interactionCount
[
0
];
int
pos
=
warp
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
int
end
=
(
warp
+
1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
#else
int
pos
=
warp
*
numTiles
/
totalWarps
;
int
end
=
(
warp
+
1
)
*
numTiles
/
totalWarps
;
#endif
int
skipBase
=
0
;
int
currentSkipIndex
=
tbx
;
__shared__
int
atomIndices
[
THREAD_BLOCK_SIZE
];
__shared__
int
skipTiles
[
THREAD_BLOCK_SIZE
];
skipTiles
[
threadIdx
.
x
]
=
-
1
;
while
(
pos
<
end
)
{
const
bool
isExcluded
=
false
;
real3
force
=
make_real3
(
0
);
DECLARE_ATOM1_DERIVATIVES
bool
includeTile
=
true
;
// Extract the coordinates of this tile.
unsigned
int
x
,
y
;
bool
singlePeriodicCopy
=
false
;
#ifdef USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles
[
pos
];
x
=
tileIndices
.
x
;
singlePeriodicCopy
=
tileIndices
.
y
;
}
else
#endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}
// Skip over tiles that have exclusions, since they were already processed.
while
(
skipTiles
[
tbx
+
TILE_SIZE
-
1
]
<
pos
)
{
if
(
skipBase
+
tgx
<
NUM_TILES_WITH_EXCLUSIONS
)
{
ushort2
tile
=
exclusionTiles
[
skipBase
+
tgx
];
skipTiles
[
threadIdx
.
x
]
=
tile
.
x
+
tile
.
y
*
NUM_BLOCKS
-
tile
.
y
*
(
tile
.
y
+
1
)
/
2
;
}
else
skipTiles
[
threadIdx
.
x
]
=
end
;
skipBase
+=
TILE_SIZE
;
currentSkipIndex
=
tbx
;
}
while
(
skipTiles
[
currentSkipIndex
]
<
pos
)
currentSkipIndex
++
;
includeTile
=
(
skipTiles
[
currentSkipIndex
]
!=
pos
);
}
if
(
includeTile
)
{
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
// Load atom data for this tile.
real4
posq1
=
posq
[
atom1
];
LOAD_ATOM1_PARAMETERS
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
#ifdef USE_CUTOFF
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
]
:
y
*
TILE_SIZE
+
tgx
);
#else
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
atomIndices
[
threadIdx
.
x
]
=
j
;
if
(
j
<
PADDED_NUM_ATOMS
)
{
localData
[
localAtomIndex
].
posq
=
posq
[
j
];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData
[
localAtomIndex
].
force
=
make_real3
(
0
);
CLEAR_LOCAL_DERIVATIVES
}
#ifdef USE_PERIODIC
if
(
singlePeriodicCopy
)
{
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
real4
blockCenterX
=
blockCenter
[
x
];
posq1
.
x
-=
floor
((
posq1
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
posq1
.
y
-=
floor
((
posq1
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
posq1
.
z
-=
floor
((
posq1
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
localData
[
threadIdx
.
x
].
posq
.
x
-=
floor
((
localData
[
threadIdx
.
x
].
posq
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
localData
[
threadIdx
.
x
].
posq
.
y
-=
floor
((
localData
[
threadIdx
.
x
].
posq
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
localData
[
threadIdx
.
x
].
posq
.
z
-=
floor
((
localData
[
threadIdx
.
x
].
posq
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
real4
posq2
=
localData
[
atom2
].
posq
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
LOAD_ATOM2_PARAMETERS
atom2
=
atomIndices
[
tbx
+
tj
];
real
dEdR
=
0
;
real
tempEnergy
=
0
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
COMPUTE_INTERACTION
dEdR
/=
-
r
;
}
energy
+=
tempEnergy
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
atom2
=
tbx
+
tj
;
localData
[
atom2
].
force
.
x
+=
delta
.
x
;
localData
[
atom2
].
force
.
y
+=
delta
.
y
;
localData
[
atom2
].
force
.
z
+=
delta
.
z
;
RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
}
#endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
else
#endif
{
// We need to apply periodic boundary conditions separately for each interaction.
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
real4
posq2
=
localData
[
atom2
].
posq
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
LOAD_ATOM2_PARAMETERS
atom2
=
atomIndices
[
tbx
+
tj
];
real
dEdR
=
0
;
real
tempEnergy
=
0
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
COMPUTE_INTERACTION
dEdR
/=
-
r
;
}
energy
+=
tempEnergy
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
atom2
=
tbx
+
tj
;
localData
[
atom2
].
force
.
x
+=
delta
.
x
;
localData
[
atom2
].
force
.
y
+=
delta
.
y
;
localData
[
atom2
].
force
.
z
+=
delta
.
z
;
RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
}
#endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
// Write results.
atomicAdd
(
&
forceBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
z
*
0x100000000
)));
unsigned
int
offset
=
atom1
;
STORE_DERIVATIVES_1
#ifdef USE_CUTOFF
unsigned
int
atom2
=
atomIndices
[
threadIdx
.
x
];
#else
unsigned
int
atom2
=
y
*
TILE_SIZE
+
tgx
;
#endif
if
(
atom2
<
PADDED_NUM_ATOMS
)
{
atomicAdd
(
&
forceBuffers
[
atom2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
offset
=
atom2
;
STORE_DERIVATIVES_2
}
}
pos
++
;
}
while
(
pos
<
end
);
}
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
}
platforms/cuda/src/kernels/customGBValueN2.cu
View file @
93c467b2
#define TILE_SIZE 32
typedef
struct
{
real4
posq
;
real
value
,
temp
;
...
...
@@ -13,86 +11,41 @@ typedef struct {
* Compute a value based on pair interactions.
*/
extern
"C"
__global__
void
computeN2Value
(
const
real4
*
__restrict__
posq
,
const
unsigned
int
*
__restrict__
exclusions
,
const
u
nsigned
int
*
__restrict__
exclusion
Indices
,
const
unsigned
int
*
__restrict__
exclusionRowIndic
es
,
unsigned
long
long
*
__restrict__
global_value
,
const
u
short2
*
__restrict__
exclusion
Til
es
,
unsigned
long
long
*
__restrict__
global_value
,
#ifdef USE_CUTOFF
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
unsigned
int
*
__restrict__
interacti
onFlag
s
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
const
unsigned
int
*
__restrict__
interacti
ngAtom
s
#else
unsigned
int
numTiles
#endif
PARAMETER_ARGUMENTS
)
{
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
#ifdef USE_CUTOFF
unsigned
int
numTiles
=
interactionCount
[
0
];
unsigned
int
pos
=
warp
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
unsigned
int
end
=
(
warp
+
1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
#else
unsigned
int
pos
=
warp
*
numTiles
/
totalWarps
;
unsigned
int
end
=
(
warp
+
1
)
*
numTiles
/
totalWarps
;
#endif
real
energy
=
0
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
__shared__
unsigned
int
exclusionRange
[
2
*
WARPS_PER_GROUP
];
__shared__
int
exclusionIndex
[
WARPS_PER_GROUP
];
do
{
// Extract the coordinates of this tile
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
const
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
const
unsigned
int
localGroupIndex
=
threadIdx
.
x
/
TILE_SIZE
;
unsigned
int
x
,
y
;
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
// First loop: process tiles that contain exclusions.
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE
+
warp
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE
+
(
warp
+
1
)
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
for
(
int
pos
=
firstExclusionTile
;
pos
<
lastExclusionTile
;
pos
++
)
{
const
ushort2
tileIndices
=
exclusionTiles
[
pos
];
const
unsigned
int
x
=
tileIndices
.
x
;
const
unsigned
int
y
=
tileIndices
.
y
;
real
value
=
0
;
if
(
pos
<
end
)
{
#ifdef USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles
[
pos
];
x
=
tileIndices
.
x
;
y
=
tileIndices
.
y
;
}
else
#endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}
}
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real4
posq1
=
posq
[
atom1
];
LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS
if
(
tgx
<
2
)
exclusionRange
[
2
*
localGroupIndex
+
tgx
]
=
exclusionRowIndices
[
x
+
tgx
];
if
(
tgx
==
0
)
exclusionIndex
[
localGroupIndex
]
=
-
1
;
for
(
unsigned
int
i
=
exclusionRange
[
2
*
localGroupIndex
]
+
tgx
;
i
<
exclusionRange
[
2
*
localGroupIndex
+
1
];
i
+=
TILE_SIZE
)
if
(
exclusionIndices
[
i
]
==
y
)
exclusionIndex
[
localGroupIndex
]
=
i
*
TILE_SIZE
;
bool
hasExclusions
=
(
exclusionIndex
[
localGroupIndex
]
>
-
1
);
#else
bool
hasExclusions
=
false
;
unsigned
int
excl
=
exclusions
[
pos
*
TILE_SIZE
+
tgx
];
#endif
if
(
pos
>=
end
)
;
// This warp is done.
else
if
(
x
==
y
)
{
if
(
x
==
y
)
{
// This tile is on the diagonal.
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
localData
[
localAtomIndex
].
posq
=
posq1
;
LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS
unsigned
int
excl
=
exclusions
[
exclusionIndex
[
localGroupIndex
]
+
tgx
];
#endif
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
#ifdef USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
);
#endif
int
atom2
=
tbx
+
j
;
real4
posq2
=
localData
[
atom2
].
posq
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
...
...
@@ -112,7 +65,8 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
real
tempValue1
=
0
;
real
tempValue2
=
0
;
#ifdef USE_EXCLUSIONS
if
(
!
isExcluded
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
atom1
!=
atom2
)
{
bool
isExcluded
=
(
atom1
>=
NUM_ATOMS
||
atom2
>=
NUM_ATOMS
||
!
(
excl
&
0x1
));
if
(
!
isExcluded
&&
atom1
!=
atom2
)
{
#else
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
atom1
!=
atom2
)
{
#endif
...
...
@@ -130,25 +84,17 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
else
{
// This is an off-diagonal tile.
if
(
lasty
!=
y
)
{
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
localData
[
threadIdx
.
x
].
posq
=
posq
[
j
];
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
localData
[
localAtomIndex
].
posq
=
posq
[
j
];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
localData
[
threadIdx
.
x
].
value
=
0
;
#ifdef USE_CUTOFF
unsigned
int
flags
=
(
numTiles
<=
maxTiles
?
interactionFlags
[
pos
]
:
0xFFFFFFFF
);
if
(
!
hasExclusions
&&
flags
!=
0xFFFFFFFF
)
{
if
(
flags
==
0
)
{
// No interactions in this tile.
}
else
{
// Compute only a subset of the interactions in this tile.
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
if
((
flags
&
(
1
<<
j
))
!=
0
)
{
int
atom2
=
tbx
+
j
;
localData
[
localAtomIndex
].
value
=
0
;
#ifdef USE_EXCLUSIONS
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
TILE_SIZE
-
tgx
));
#endif
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
real4
posq2
=
localData
[
atom2
].
posq
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
...
...
@@ -157,44 +103,162 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
real
tempValue1
=
0
;
real
tempValue2
=
0
;
#ifdef USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
LOAD_ATOM2_PARAMETERS
atom2
=
y
*
TILE_SIZE
+
j
;
atom2
=
y
*
TILE_SIZE
+
tj
;
real
tempValue1
=
0
;
real
tempValue2
=
0
;
#ifdef USE_EXCLUSIONS
bool
isExcluded
=
(
atom1
>=
NUM_ATOMS
||
atom2
>=
NUM_ATOMS
||
!
(
excl
&
0x1
));
if
(
!
isExcluded
)
{
#else
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
#endif
COMPUTE_VALUE
}
value
+=
tempValue1
;
localData
[
tbx
+
tj
].
value
+=
tempValue2
;
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl
>>=
1
;
#endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
localData
[
threadIdx
.
x
].
temp
=
tempValue2
;
// Sum the forces on atom2
.
// Write results
.
if
(
tgx
%
4
==
0
)
localData
[
threadIdx
.
x
].
temp
+=
lo
c
al
Data
[
threadIdx
.
x
+
1
].
temp
+
localData
[
threadIdx
.
x
+
2
].
temp
+
localData
[
threadIdx
.
x
+
3
].
temp
;
if
(
tg
x
=
=
0
)
localData
[
tbx
+
j
].
value
+=
localData
[
threadIdx
.
x
].
temp
+
localData
[
threadIdx
.
x
+
4
].
temp
+
localData
[
threadIdx
.
x
+
8
].
temp
+
localData
[
threadIdx
.
x
+
12
].
temp
+
localData
[
threadIdx
.
x
+
16
].
temp
+
localData
[
threadIdx
.
x
+
20
].
temp
+
localData
[
threadIdx
.
x
+
24
].
temp
+
localData
[
threadIdx
.
x
+
28
].
temp
;
}
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
g
lo
b
al
_value
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
value
*
0x100000000
)))
;
if
(
x
!
=
y
)
{
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
global_value
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
value
*
0x100000000
)));
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
unsigned
int
numTiles
=
interactionCount
[
0
];
int
pos
=
warp
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
int
end
=
(
warp
+
1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
#else
int
pos
=
warp
*
numTiles
/
totalWarps
;
int
end
=
(
warp
+
1
)
*
numTiles
/
totalWarps
;
#endif
int
skipBase
=
0
;
int
currentSkipIndex
=
tbx
;
__shared__
int
atomIndices
[
THREAD_BLOCK_SIZE
];
__shared__
int
skipTiles
[
THREAD_BLOCK_SIZE
];
skipTiles
[
threadIdx
.
x
]
=
-
1
;
while
(
pos
<
end
)
{
real
value
=
0
;
bool
includeTile
=
true
;
// Extract the coordinates of this tile.
unsigned
int
x
,
y
;
bool
singlePeriodicCopy
=
false
;
#ifdef USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles
[
pos
];
x
=
tileIndices
.
x
;
singlePeriodicCopy
=
tileIndices
.
y
;
}
else
#endif
{
// Compute the full set of interactions in this tile.
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}
#ifdef USE_EXCLUSIONS
unsigned
int
excl
=
(
hasExclusions
?
exclusions
[
exclusionIndex
[
localGroupIndex
]
+
tgx
]
:
0xFFFFFFFF
);
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
TILE_SIZE
-
tgx
));
// Skip over tiles that have exclusions, since they were already processed.
while
(
skipTiles
[
tbx
+
TILE_SIZE
-
1
]
<
pos
)
{
if
(
skipBase
+
tgx
<
NUM_TILES_WITH_EXCLUSIONS
)
{
ushort2
tile
=
exclusionTiles
[
skipBase
+
tgx
];
skipTiles
[
threadIdx
.
x
]
=
tile
.
x
+
tile
.
y
*
NUM_BLOCKS
-
tile
.
y
*
(
tile
.
y
+
1
)
/
2
;
}
else
skipTiles
[
threadIdx
.
x
]
=
end
;
skipBase
+=
TILE_SIZE
;
currentSkipIndex
=
tbx
;
}
while
(
skipTiles
[
currentSkipIndex
]
<
pos
)
currentSkipIndex
++
;
includeTile
=
(
skipTiles
[
currentSkipIndex
]
!=
pos
);
}
if
(
includeTile
)
{
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
// Load atom data for this tile.
real4
posq1
=
posq
[
atom1
];
LOAD_ATOM1_PARAMETERS
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
#ifdef USE_CUTOFF
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
]
:
y
*
TILE_SIZE
+
tgx
);
#else
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
atomIndices
[
threadIdx
.
x
]
=
j
;
if
(
j
<
PADDED_NUM_ATOMS
)
{
localData
[
localAtomIndex
].
posq
=
posq
[
j
];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData
[
localAtomIndex
].
value
=
0
;
}
#ifdef USE_PERIODIC
if
(
singlePeriodicCopy
)
{
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
real4
blockCenterX
=
blockCenter
[
x
];
posq1
.
x
-=
floor
((
posq1
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
posq1
.
y
-=
floor
((
posq1
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
posq1
.
z
-=
floor
((
posq1
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
localData
[
threadIdx
.
x
].
posq
.
x
-=
floor
((
localData
[
threadIdx
.
x
].
posq
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
localData
[
threadIdx
.
x
].
posq
.
y
-=
floor
((
localData
[
threadIdx
.
x
].
posq
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
localData
[
threadIdx
.
x
].
posq
.
z
-=
floor
((
localData
[
threadIdx
.
x
].
posq
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
#ifdef USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
);
int
atom2
=
tbx
+
tj
;
real4
posq2
=
localData
[
atom2
].
posq
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
if
(
r2
<
CUTOFF_SQUARED
)
{
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
LOAD_ATOM2_PARAMETERS
atom2
=
atomIndices
[
tbx
+
tj
];
real
tempValue1
=
0
;
real
tempValue2
=
0
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
COMPUTE_VALUE
}
value
+=
tempValue1
;
localData
[
tbx
+
tj
].
value
+=
tempValue2
;
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
else
#endif
{
// We need to apply periodic boundary conditions separately for each interaction.
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
real4
posq2
=
localData
[
atom2
].
posq
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
...
...
@@ -210,41 +274,32 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
LOAD_ATOM2_PARAMETERS
atom2
=
y
*
TILE_SIZE
+
tj
;
atom2
=
atomIndices
[
tbx
+
tj
]
;
real
tempValue1
=
0
;
real
tempValue2
=
0
;
#ifdef USE_EXCLUSIONS
if
(
!
isExcluded
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
#else
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
#endif
COMPUTE_VALUE
}
value
+=
tempValue1
;
localData
[
tbx
+
tj
].
value
+=
tempValue2
;
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl
>>=
1
;
#endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
}
}
// Write results.
if
(
pos
<
end
)
{
const
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
global_value
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
value
*
0x100000000
)));
}
if
(
pos
<
end
&&
x
!=
y
)
{
const
unsigned
int
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
global_value
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
value
*
0x100000000
)));
atomicAdd
(
&
global_value
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
value
*
0x100000000
)));
#ifdef USE_CUTOFF
unsigned
int
atom2
=
atomIndices
[
threadIdx
.
x
];
#else
unsigned
int
atom2
=
y
*
TILE_SIZE
+
tgx
;
#endif
if
(
atom2
<
PADDED_NUM_ATOMS
)
atomicAdd
(
&
global_value
[
atom2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
value
*
0x100000000
)));
}
lasty
=
y
;
pos
++
;
}
while
(
pos
<
end
);
}
}
platforms/cuda/src/kernels/customHbondForce.cu
View file @
93c467b2
...
...
@@ -48,12 +48,12 @@ inline __device__ real computeAngle(real4 vec1, real4 vec2) {
real3
crossProduct
=
cross
(
vec1
,
vec2
);
real
scale
=
vec1
.
w
*
vec2
.
w
;
angle
=
asin
(
SQRT
(
dot
(
crossProduct
,
crossProduct
)
/
scale
));
angle
=
ASIN
(
SQRT
(
dot
(
crossProduct
,
crossProduct
)
/
scale
));
if
(
cosine
<
0.0
f
)
angle
=
M_PI
-
angle
;
}
else
angle
=
acos
(
cosine
);
angle
=
ACOS
(
cosine
);
return
angle
;
}
...
...
platforms/cuda/src/kernels/ewald.cu
View file @
93c467b2
...
...
@@ -35,11 +35,11 @@ extern "C" __global__ void calculateEwaldCosSinSums(real* __restrict__ energyBuf
for
(
int
atom
=
0
;
atom
<
NUM_ATOMS
;
atom
++
)
{
real4
apos
=
posq
[
atom
];
real
phase
=
apos
.
x
*
kx
;
real2
structureFactor
=
make_real2
(
cos
(
phase
),
sin
(
phase
));
real2
structureFactor
=
make_real2
(
COS
(
phase
),
SIN
(
phase
));
phase
=
apos
.
y
*
ky
;
structureFactor
=
multofReal2
(
structureFactor
,
make_real2
(
cos
(
phase
),
sin
(
phase
)));
structureFactor
=
multofReal2
(
structureFactor
,
make_real2
(
COS
(
phase
),
SIN
(
phase
)));
phase
=
apos
.
z
*
kz
;
structureFactor
=
multofReal2
(
structureFactor
,
make_real2
(
cos
(
phase
),
sin
(
phase
)));
structureFactor
=
multofReal2
(
structureFactor
,
make_real2
(
COS
(
phase
),
SIN
(
phase
)));
sum
+=
apos
.
w
*
structureFactor
;
}
cosSinSum
[
index
]
=
sum
;
...
...
@@ -76,9 +76,9 @@ extern "C" __global__ void calculateEwaldForces(unsigned long long* __restrict__
for
(
int
ry
=
lowry
;
ry
<
KMAX_Y
;
ry
++
)
{
real
ky
=
ry
*
reciprocalBoxSize
.
y
;
real
phase
=
apos
.
x
*
kx
;
real2
tab_xy
=
make_real2
(
cos
(
phase
),
sin
(
phase
));
real2
tab_xy
=
make_real2
(
COS
(
phase
),
SIN
(
phase
));
phase
=
apos
.
y
*
ky
;
tab_xy
=
multofReal2
(
tab_xy
,
make_real2
(
cos
(
phase
),
sin
(
phase
)));
tab_xy
=
multofReal2
(
tab_xy
,
make_real2
(
COS
(
phase
),
SIN
(
phase
)));
for
(
int
rz
=
lowrz
;
rz
<
KMAX_Z
;
rz
++
)
{
real
kz
=
rz
*
reciprocalBoxSize
.
z
;
...
...
@@ -88,7 +88,7 @@ extern "C" __global__ void calculateEwaldForces(unsigned long long* __restrict__
real
k2
=
kx
*
kx
+
ky
*
ky
+
kz
*
kz
;
real
ak
=
EXP
(
k2
*
EXP_COEFFICIENT
)
/
k2
;
phase
=
apos
.
z
*
kz
;
real2
structureFactor
=
multofReal2
(
tab_xy
,
make_real2
(
cos
(
phase
),
sin
(
phase
)));
real2
structureFactor
=
multofReal2
(
tab_xy
,
make_real2
(
COS
(
phase
),
SIN
(
phase
)));
real2
sum
=
cosSinSum
[
index
];
real
dEdR
=
2
*
reciprocalCoefficient
*
ak
*
apos
.
w
*
(
sum
.
x
*
structureFactor
.
y
-
sum
.
y
*
structureFactor
.
x
);
force
.
x
+=
dEdR
*
kx
;
...
...
platforms/cuda/src/kernels/findInteractingBlocks.cu
View file @
93c467b2
This diff is collapsed.
Click to expand it.
platforms/cuda/src/kernels/gbsaObc1.cu
View file @
93c467b2
This diff is collapsed.
Click to expand it.
platforms/cuda/src/kernels/integrationUtilities.cu
View file @
93c467b2
...
...
@@ -24,14 +24,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
state
.
y
^=
state
.
y
<<
13
;
state
.
y
^=
state
.
y
>>
17
;
state
.
y
^=
state
.
y
<<
5
;
x1
=
sqrt
(
-
2.0
f
*
log
(
x1
));
x1
=
SQRT
(
-
2.0
f
*
LOG
(
x1
));
k
=
(
state
.
z
>>
2
)
+
(
state
.
w
>>
3
)
+
(
carry
>>
2
);
m
=
state
.
w
+
state
.
w
+
state
.
z
+
carry
;
state
.
z
=
state
.
w
;
state
.
w
=
m
;
carry
=
k
>>
30
;
float
x2
=
(
float
)(
state
.
x
+
state
.
y
+
state
.
w
)
/
(
float
)
0xffffffff
;
value
.
x
=
x1
*
cos
(
2.0
f
*
3.14159265
f
*
x2
);
value
.
x
=
x1
*
COS
(
2.0
f
*
3.14159265
f
*
x2
);
// Generate second value.
...
...
@@ -49,14 +49,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
state
.
y
^=
state
.
y
<<
13
;
state
.
y
^=
state
.
y
>>
17
;
state
.
y
^=
state
.
y
<<
5
;
x3
=
sqrt
(
-
2.0
f
*
log
(
x3
));
x3
=
SQRT
(
-
2.0
f
*
LOG
(
x3
));
k
=
(
state
.
z
>>
2
)
+
(
state
.
w
>>
3
)
+
(
carry
>>
2
);
m
=
state
.
w
+
state
.
w
+
state
.
z
+
carry
;
state
.
z
=
state
.
w
;
state
.
w
=
m
;
carry
=
k
>>
30
;
float
x4
=
(
float
)(
state
.
x
+
state
.
y
+
state
.
w
)
/
(
float
)
0xffffffff
;
value
.
y
=
x3
*
cos
(
2.0
f
*
3.14159265
f
*
x4
);
value
.
y
=
x3
*
COS
(
2.0
f
*
3.14159265
f
*
x4
);
// Generate third value.
...
...
@@ -74,14 +74,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
state
.
y
^=
state
.
y
<<
13
;
state
.
y
^=
state
.
y
>>
17
;
state
.
y
^=
state
.
y
<<
5
;
x5
=
sqrt
(
-
2.0
f
*
log
(
x5
));
x5
=
SQRT
(
-
2.0
f
*
LOG
(
x5
));
k
=
(
state
.
z
>>
2
)
+
(
state
.
w
>>
3
)
+
(
carry
>>
2
);
m
=
state
.
w
+
state
.
w
+
state
.
z
+
carry
;
state
.
z
=
state
.
w
;
state
.
w
=
m
;
carry
=
k
>>
30
;
float
x6
=
(
float
)(
state
.
x
+
state
.
y
+
state
.
w
)
/
(
float
)
0xffffffff
;
value
.
z
=
x5
*
cos
(
2.0
f
*
3.14159265
f
*
x6
);
value
.
z
=
x5
*
COS
(
2.0
f
*
3.14159265
f
*
x6
);
// Generate fourth value.
...
...
@@ -99,14 +99,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
state
.
y
^=
state
.
y
<<
13
;
state
.
y
^=
state
.
y
>>
17
;
state
.
y
^=
state
.
y
<<
5
;
x7
=
sqrt
(
-
2.0
f
*
log
(
x7
));
x7
=
SQRT
(
-
2.0
f
*
LOG
(
x7
));
k
=
(
state
.
z
>>
2
)
+
(
state
.
w
>>
3
)
+
(
carry
>>
2
);
m
=
state
.
w
+
state
.
w
+
state
.
z
+
carry
;
state
.
z
=
state
.
w
;
state
.
w
=
m
;
carry
=
k
>>
30
;
float
x8
=
(
float
)(
state
.
x
+
state
.
y
+
state
.
w
)
/
(
float
)
0xffffffff
;
value
.
w
=
x7
*
cos
(
2.0
f
*
3.14159265
f
*
x8
);
value
.
w
=
x7
*
COS
(
2.0
f
*
3.14159265
f
*
x8
);
// Record the values.
...
...
@@ -412,9 +412,9 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
mixed
yaksYd
=
zaksZd
*
xaksXd
-
xaksZd
*
zaksXd
;
mixed
zaksYd
=
xaksZd
*
yaksXd
-
yaksZd
*
xaksXd
;
mixed
axlng
=
sqrt
(
xaksXd
*
xaksXd
+
yaksXd
*
yaksXd
+
zaksXd
*
zaksXd
);
mixed
aylng
=
sqrt
(
xaksYd
*
xaksYd
+
yaksYd
*
yaksYd
+
zaksYd
*
zaksYd
);
mixed
azlng
=
sqrt
(
xaksZd
*
xaksZd
+
yaksZd
*
yaksZd
+
zaksZd
*
zaksZd
);
mixed
axlng
=
SQRT
(
xaksXd
*
xaksXd
+
yaksXd
*
yaksXd
+
zaksXd
*
zaksXd
);
mixed
aylng
=
SQRT
(
xaksYd
*
xaksYd
+
yaksYd
*
yaksYd
+
zaksYd
*
zaksYd
);
mixed
azlng
=
SQRT
(
xaksZd
*
xaksZd
+
yaksZd
*
yaksZd
+
zaksZd
*
zaksZd
);
mixed
trns11
=
xaksXd
/
axlng
;
mixed
trns21
=
yaksXd
/
axlng
;
mixed
trns31
=
zaksXd
/
axlng
;
...
...
@@ -440,13 +440,13 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
// --- Step2 A2' ---
float
rc
=
0.5
f
*
params
.
y
;
mixed
rb
=
sqrt
(
params
.
x
*
params
.
x
-
rc
*
rc
);
mixed
rb
=
SQRT
(
params
.
x
*
params
.
x
-
rc
*
rc
);
mixed
ra
=
rb
*
(
m1
+
m2
)
*
invTotalMass
;
rb
-=
ra
;
mixed
sinphi
=
za1d
/
ra
;
mixed
cosphi
=
sqrt
(
1
-
sinphi
*
sinphi
);
mixed
cosphi
=
SQRT
(
1
-
sinphi
*
sinphi
);
mixed
sinpsi
=
(
zb1d
-
zc1d
)
/
(
2
*
rc
*
cosphi
);
mixed
cospsi
=
sqrt
(
1
-
sinpsi
*
sinpsi
);
mixed
cospsi
=
SQRT
(
1
-
sinpsi
*
sinpsi
);
mixed
ya2d
=
ra
*
cosphi
;
mixed
xb2d
=
-
rc
*
cospsi
;
...
...
@@ -454,7 +454,7 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
mixed
yc2d
=
-
rb
*
cosphi
+
rc
*
sinpsi
*
sinphi
;
mixed
xb2d2
=
xb2d
*
xb2d
;
mixed
hh2
=
4.0
f
*
xb2d2
+
(
yb2d
-
yc2d
)
*
(
yb2d
-
yc2d
)
+
(
zb1d
-
zc1d
)
*
(
zb1d
-
zc1d
);
mixed
deltx
=
2.0
f
*
xb2d
+
sqrt
(
4.0
f
*
xb2d2
-
hh2
+
params
.
y
*
params
.
y
);
mixed
deltx
=
2.0
f
*
xb2d
+
SQRT
(
4.0
f
*
xb2d2
-
hh2
+
params
.
y
*
params
.
y
);
xb2d
-=
deltx
*
0.5
f
;
// --- Step3 al,be,ga ---
...
...
@@ -464,11 +464,11 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
mixed
gamma
=
xb0d
*
yb1d
-
xb1d
*
yb0d
+
xc0d
*
yc1d
-
xc1d
*
yc0d
;
mixed
al2be2
=
alpha
*
alpha
+
beta
*
beta
;
mixed
sintheta
=
(
alpha
*
gamma
-
beta
*
sqrt
(
al2be2
-
gamma
*
gamma
))
/
al2be2
;
mixed
sintheta
=
(
alpha
*
gamma
-
beta
*
SQRT
(
al2be2
-
gamma
*
gamma
))
/
al2be2
;
// --- Step4 A3' ---
mixed
costheta
=
sqrt
(
1
-
sintheta
*
sintheta
);
mixed
costheta
=
SQRT
(
1
-
sintheta
*
sintheta
);
mixed
xa3d
=
-
ya2d
*
sintheta
;
mixed
ya3d
=
ya2d
*
costheta
;
mixed
za3d
=
za1d
;
...
...
@@ -534,9 +534,9 @@ extern "C" __global__ void applySettleToVelocities(int numClusters, mixed tol, c
mixed3
eAB
=
make_mixed3
(
apos1
.
x
-
apos0
.
x
,
apos1
.
y
-
apos0
.
y
,
apos1
.
z
-
apos0
.
z
);
mixed3
eBC
=
make_mixed3
(
apos2
.
x
-
apos1
.
x
,
apos2
.
y
-
apos1
.
y
,
apos2
.
z
-
apos1
.
z
);
mixed3
eCA
=
make_mixed3
(
apos0
.
x
-
apos2
.
x
,
apos0
.
y
-
apos2
.
y
,
apos0
.
z
-
apos2
.
z
);
eAB
*=
rsqrt
(
eAB
.
x
*
eAB
.
x
+
eAB
.
y
*
eAB
.
y
+
eAB
.
z
*
eAB
.
z
);
eBC
*=
rsqrt
(
eBC
.
x
*
eBC
.
x
+
eBC
.
y
*
eBC
.
y
+
eBC
.
z
*
eBC
.
z
);
eCA
*=
rsqrt
(
eCA
.
x
*
eCA
.
x
+
eCA
.
y
*
eCA
.
y
+
eCA
.
z
*
eCA
.
z
);
eAB
*=
RSQRT
(
eAB
.
x
*
eAB
.
x
+
eAB
.
y
*
eAB
.
y
+
eAB
.
z
*
eAB
.
z
);
eBC
*=
RSQRT
(
eBC
.
x
*
eBC
.
x
+
eBC
.
y
*
eBC
.
y
+
eBC
.
z
*
eBC
.
z
);
eCA
*=
RSQRT
(
eCA
.
x
*
eCA
.
x
+
eCA
.
y
*
eCA
.
y
+
eCA
.
z
*
eCA
.
z
);
mixed
vAB
=
(
v1
.
x
-
v0
.
x
)
*
eAB
.
x
+
(
v1
.
y
-
v0
.
y
)
*
eAB
.
y
+
(
v1
.
z
-
v0
.
z
)
*
eAB
.
z
;
mixed
vBC
=
(
v2
.
x
-
v1
.
x
)
*
eBC
.
x
+
(
v2
.
y
-
v1
.
y
)
*
eBC
.
y
+
(
v2
.
z
-
v1
.
z
)
*
eBC
.
z
;
mixed
vCA
=
(
v0
.
x
-
v2
.
x
)
*
eCA
.
x
+
(
v0
.
y
-
v2
.
y
)
*
eCA
.
y
+
(
v0
.
z
-
v2
.
z
)
*
eCA
.
z
;
...
...
@@ -574,7 +574,8 @@ extern "C" __global__ void applySettleToVelocities(int numClusters, mixed tol, c
/**
* Compute the direction each CCMA constraint is pointing in. This is called once at the beginning of constraint evaluation.
*/
extern
"C"
__global__
void
computeCCMAConstraintDirections
(
const
int2
*
__restrict__
constraintAtoms
,
mixed4
*
__restrict__
constraintDistance
,
const
real4
*
__restrict__
atomPositions
,
const
real4
*
__restrict__
posqCorrection
)
{
extern
"C"
__global__
void
computeCCMAConstraintDirections
(
const
int2
*
__restrict__
constraintAtoms
,
mixed4
*
__restrict__
constraintDistance
,
const
real4
*
__restrict__
atomPositions
,
const
real4
*
__restrict__
posqCorrection
,
int
*
__restrict__
converged
)
{
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_CCMA_CONSTRAINTS
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
// Compute the direction for this constraint.
...
...
@@ -587,6 +588,10 @@ extern "C" __global__ void computeCCMAConstraintDirections(const int2* __restric
dir
.
z
=
oldPos1
.
z
-
oldPos2
.
z
;
constraintDistance
[
index
]
=
dir
;
}
if
(
threadIdx
.
x
==
0
&&
blockIdx
.
x
==
0
)
{
converged
[
0
]
=
1
;
converged
[
1
]
=
0
;
}
}
/**
...
...
@@ -605,6 +610,7 @@ extern "C" __global__ void computeCCMAPositionConstraintForce(const int2* __rest
__syncthreads
();
mixed
lowerTol
=
1
-
2
*
tol
+
tol
*
tol
;
mixed
upperTol
=
1
+
2
*
tol
+
tol
*
tol
;
bool
threadConverged
=
true
;
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_CCMA_CONSTRAINTS
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
// Compute the force due to this constraint.
...
...
@@ -620,14 +626,13 @@ extern "C" __global__ void computeCCMAPositionConstraintForce(const int2* __rest
mixed
dist2
=
dir
.
w
*
dir
.
w
;
mixed
diff
=
dist2
-
rp2
;
delta1
[
index
]
=
(
rrpr
>
d_ij2
*
1e-6
f
?
reducedMass
[
index
]
*
diff
/
rrpr
:
0.0
f
);
// See whether it has converged.
if
(
groupConverged
&&
(
rp2
<
lowerTol
*
dist2
||
rp2
>
upperTol
*
dist2
))
{
threadConverged
&=
(
rp2
>
lowerTol
*
dist2
&&
rp2
<
upperTol
*
dist2
);
}
if
(
groupConverged
&&
!
threadConverged
)
groupConverged
=
0
;
__syncthreads
();
if
(
threadIdx
.
x
==
0
&&
!
groupConverged
)
converged
[
iteration
%
2
]
=
0
;
}
}
}
/**
...
...
Prev
1
2
3
4
5
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment