Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
93c467b2
Commit
93c467b2
authored
Mar 22, 2013
by
Peter Eastman
Browse files
Merged 5.1Optimizations branch back to trunk
parent
f6d4557d
Changes
86
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1724 additions
and
1166 deletions
+1724
-1166
platforms/cuda/src/CudaContext.cpp
platforms/cuda/src/CudaContext.cpp
+2
-1
platforms/cuda/src/CudaContext.h
platforms/cuda/src/CudaContext.h
+2
-0
platforms/cuda/src/CudaIntegrationUtilities.cpp
platforms/cuda/src/CudaIntegrationUtilities.cpp
+14
-16
platforms/cuda/src/CudaIntegrationUtilities.h
platforms/cuda/src/CudaIntegrationUtilities.h
+2
-3
platforms/cuda/src/CudaKernels.cpp
platforms/cuda/src/CudaKernels.cpp
+99
-71
platforms/cuda/src/CudaKernels.h
platforms/cuda/src/CudaKernels.h
+3
-7
platforms/cuda/src/CudaNonbondedUtilities.cpp
platforms/cuda/src/CudaNonbondedUtilities.cpp
+171
-100
platforms/cuda/src/CudaNonbondedUtilities.h
platforms/cuda/src/CudaNonbondedUtilities.h
+35
-23
platforms/cuda/src/CudaParallelKernels.cpp
platforms/cuda/src/CudaParallelKernels.cpp
+15
-17
platforms/cuda/src/CudaParallelKernels.h
platforms/cuda/src/CudaParallelKernels.h
+2
-2
platforms/cuda/src/CudaSort.cpp
platforms/cuda/src/CudaSort.cpp
+42
-32
platforms/cuda/src/CudaSort.h
platforms/cuda/src/CudaSort.h
+3
-2
platforms/cuda/src/kernels/coulombLennardJones.cu
platforms/cuda/src/kernels/coulombLennardJones.cu
+3
-3
platforms/cuda/src/kernels/customGBEnergyN2.cu
platforms/cuda/src/kernels/customGBEnergyN2.cu
+267
-134
platforms/cuda/src/kernels/customGBValueN2.cu
platforms/cuda/src/kernels/customGBValueN2.cu
+223
-168
platforms/cuda/src/kernels/customHbondForce.cu
platforms/cuda/src/kernels/customHbondForce.cu
+2
-2
platforms/cuda/src/kernels/ewald.cu
platforms/cuda/src/kernels/ewald.cu
+6
-6
platforms/cuda/src/kernels/findInteractingBlocks.cu
platforms/cuda/src/kernels/findInteractingBlocks.cu
+261
-143
platforms/cuda/src/kernels/gbsaObc1.cu
platforms/cuda/src/kernels/gbsaObc1.cu
+539
-408
platforms/cuda/src/kernels/integrationUtilities.cu
platforms/cuda/src/kernels/integrationUtilities.cu
+33
-28
No files found.
platforms/cuda/src/CudaContext.cpp
View file @
93c467b2
...
@@ -61,7 +61,7 @@ using namespace OpenMM;
...
@@ -61,7 +61,7 @@ using namespace OpenMM;
using
namespace
std
;
using
namespace
std
;
const
int
CudaContext
::
ThreadBlockSize
=
64
;
const
int
CudaContext
::
ThreadBlockSize
=
64
;
const
int
CudaContext
::
TileSize
=
32
;
const
int
CudaContext
::
TileSize
=
sizeof
(
tileflags
)
*
8
;
bool
CudaContext
::
hasInitializedCuda
=
false
;
bool
CudaContext
::
hasInitializedCuda
=
false
;
CudaContext
::
CudaContext
(
const
System
&
system
,
int
deviceIndex
,
bool
useBlockingSync
,
const
string
&
precision
,
const
string
&
compiler
,
CudaContext
::
CudaContext
(
const
System
&
system
,
int
deviceIndex
,
bool
useBlockingSync
,
const
string
&
precision
,
const
string
&
compiler
,
...
@@ -369,6 +369,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
...
@@ -369,6 +369,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
src
<<
"typedef float3 mixed3;
\n
"
;
src
<<
"typedef float3 mixed3;
\n
"
;
src
<<
"typedef float4 mixed4;
\n
"
;
src
<<
"typedef float4 mixed4;
\n
"
;
}
}
src
<<
"typedef unsigned int tileflags;
\n
"
;
for
(
map
<
string
,
string
>::
const_iterator
iter
=
defines
.
begin
();
iter
!=
defines
.
end
();
++
iter
)
{
for
(
map
<
string
,
string
>::
const_iterator
iter
=
defines
.
begin
();
iter
!=
defines
.
end
();
++
iter
)
{
src
<<
"#define "
<<
iter
->
first
;
src
<<
"#define "
<<
iter
->
first
;
if
(
!
iter
->
second
.
empty
())
if
(
!
iter
->
second
.
empty
())
...
...
platforms/cuda/src/CudaContext.h
View file @
93c467b2
...
@@ -42,6 +42,8 @@
...
@@ -42,6 +42,8 @@
#include "windowsExportCuda.h"
#include "windowsExportCuda.h"
#include "CudaPlatform.h"
#include "CudaPlatform.h"
typedef
unsigned
int
tileflags
;
namespace
OpenMM
{
namespace
OpenMM
{
class
CudaArray
;
class
CudaArray
;
...
...
platforms/cuda/src/CudaIntegrationUtilities.cpp
View file @
93c467b2
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* *
* Portions copyright (c) 2009-201
2
Stanford University and the Authors. *
* Portions copyright (c) 2009-201
3
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Authors: Peter Eastman *
* Contributors: *
* Contributors: *
* *
* *
...
@@ -99,7 +99,7 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
...
@@ -99,7 +99,7 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
posDelta
(
NULL
),
settleAtoms
(
NULL
),
settleParams
(
NULL
),
shakeAtoms
(
NULL
),
shakeParams
(
NULL
),
posDelta
(
NULL
),
settleAtoms
(
NULL
),
settleParams
(
NULL
),
shakeAtoms
(
NULL
),
shakeParams
(
NULL
),
random
(
NULL
),
randomSeed
(
NULL
),
randomPos
(
0
),
stepSize
(
NULL
),
ccmaAtoms
(
NULL
),
ccmaDistance
(
NULL
),
random
(
NULL
),
randomSeed
(
NULL
),
randomPos
(
0
),
stepSize
(
NULL
),
ccmaAtoms
(
NULL
),
ccmaDistance
(
NULL
),
ccmaReducedMass
(
NULL
),
ccmaAtomConstraints
(
NULL
),
ccmaNumAtomConstraints
(
NULL
),
ccmaConstraintMatrixColumn
(
NULL
),
ccmaReducedMass
(
NULL
),
ccmaAtomConstraints
(
NULL
),
ccmaNumAtomConstraints
(
NULL
),
ccmaConstraintMatrixColumn
(
NULL
),
ccmaConstraintMatrixValue
(
NULL
),
ccmaDelta1
(
NULL
),
ccmaDelta2
(
NULL
),
ccmaConverged
Memory
(
NULL
),
ccmaConstraintMatrixValue
(
NULL
),
ccmaDelta1
(
NULL
),
ccmaDelta2
(
NULL
),
ccmaConverged
(
NULL
),
vsite2AvgAtoms
(
NULL
),
vsite2AvgWeights
(
NULL
),
vsite3AvgAtoms
(
NULL
),
vsite3AvgWeights
(
NULL
),
vsite2AvgAtoms
(
NULL
),
vsite2AvgWeights
(
NULL
),
vsite3AvgAtoms
(
NULL
),
vsite3AvgWeights
(
NULL
),
vsiteOutOfPlaneAtoms
(
NULL
),
vsiteOutOfPlaneWeights
(
NULL
)
{
vsiteOutOfPlaneAtoms
(
NULL
),
vsiteOutOfPlaneWeights
(
NULL
)
{
// Create workspace arrays.
// Create workspace arrays.
...
@@ -466,9 +466,8 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
...
@@ -466,9 +466,8 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
ccmaAtoms
=
CudaArray
::
create
<
int2
>
(
context
,
numCCMA
,
"CcmaAtoms"
);
ccmaAtoms
=
CudaArray
::
create
<
int2
>
(
context
,
numCCMA
,
"CcmaAtoms"
);
ccmaAtomConstraints
=
CudaArray
::
create
<
int
>
(
context
,
numAtoms
*
maxAtomConstraints
,
"CcmaAtomConstraints"
);
ccmaAtomConstraints
=
CudaArray
::
create
<
int
>
(
context
,
numAtoms
*
maxAtomConstraints
,
"CcmaAtomConstraints"
);
ccmaNumAtomConstraints
=
CudaArray
::
create
<
int
>
(
context
,
numAtoms
,
"CcmaAtomConstraintsIndex"
);
ccmaNumAtomConstraints
=
CudaArray
::
create
<
int
>
(
context
,
numAtoms
,
"CcmaAtomConstraintsIndex"
);
CHECK_RESULT2
(
cuMemHostAlloc
((
void
**
)
&
ccmaConvergedMemory
,
2
*
sizeof
(
int
),
CU_MEMHOSTALLOC_DEVICEMAP
),
"Error allocating pinned memory"
);
CHECK_RESULT2
(
cuMemHostGetDevicePointer
(
&
ccmaConvergedDeviceMemory
,
ccmaConvergedMemory
,
0
),
"Error getting device address for pinned memory"
);
ccmaConstraintMatrixColumn
=
CudaArray
::
create
<
int
>
(
context
,
numCCMA
*
maxRowElements
,
"ConstraintMatrixColumn"
);
ccmaConstraintMatrixColumn
=
CudaArray
::
create
<
int
>
(
context
,
numCCMA
*
maxRowElements
,
"ConstraintMatrixColumn"
);
ccmaConverged
=
CudaArray
::
create
<
int
>
(
context
,
2
,
"ccmaConverged"
);
vector
<
int2
>
atomsVec
(
ccmaAtoms
->
getSize
());
vector
<
int2
>
atomsVec
(
ccmaAtoms
->
getSize
());
vector
<
int
>
atomConstraintsVec
(
ccmaAtomConstraints
->
getSize
());
vector
<
int
>
atomConstraintsVec
(
ccmaAtomConstraints
->
getSize
());
vector
<
int
>
numAtomConstraintsVec
(
ccmaNumAtomConstraints
->
getSize
());
vector
<
int
>
numAtomConstraintsVec
(
ccmaNumAtomConstraints
->
getSize
());
...
@@ -680,8 +679,8 @@ CudaIntegrationUtilities::~CudaIntegrationUtilities() {
...
@@ -680,8 +679,8 @@ CudaIntegrationUtilities::~CudaIntegrationUtilities() {
delete
ccmaDelta1
;
delete
ccmaDelta1
;
if
(
ccmaDelta2
!=
NULL
)
if
(
ccmaDelta2
!=
NULL
)
delete
ccmaDelta2
;
delete
ccmaDelta2
;
if
(
ccmaConverged
Memory
!=
NULL
)
if
(
ccmaConverged
!=
NULL
)
cuMemFreeHost
(
ccmaConverged
Memory
)
;
delete
ccmaConverged
;
if
(
vsite2AvgAtoms
!=
NULL
)
if
(
vsite2AvgAtoms
!=
NULL
)
delete
vsite2AvgAtoms
;
delete
vsite2AvgAtoms
;
if
(
vsite2AvgWeights
!=
NULL
)
if
(
vsite2AvgWeights
!=
NULL
)
...
@@ -734,33 +733,32 @@ void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double
...
@@ -734,33 +733,32 @@ void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double
context
.
executeKernel
(
shakeKernel
,
args
,
shakeAtoms
->
getSize
());
context
.
executeKernel
(
shakeKernel
,
args
,
shakeAtoms
->
getSize
());
}
}
if
(
ccmaAtoms
!=
NULL
)
{
if
(
ccmaAtoms
!=
NULL
)
{
void
*
directionsArgs
[]
=
{
&
ccmaAtoms
->
getDevicePointer
(),
&
ccmaDistance
->
getDevicePointer
(),
&
context
.
getPosq
().
getDevicePointer
(),
&
posCorrection
};
void
*
directionsArgs
[]
=
{
&
ccmaAtoms
->
getDevicePointer
(),
&
ccmaDistance
->
getDevicePointer
(),
&
context
.
getPosq
().
getDevicePointer
(),
&
posCorrection
,
&
ccmaConverged
->
getDevicePointer
()
};
context
.
executeKernel
(
ccmaDirectionsKernel
,
directionsArgs
,
ccmaAtoms
->
getSize
());
context
.
executeKernel
(
ccmaDirectionsKernel
,
directionsArgs
,
ccmaAtoms
->
getSize
());
int
i
;
int
i
;
void
*
forceArgs
[]
=
{
&
ccmaAtoms
->
getDevicePointer
(),
&
ccmaDistance
->
getDevicePointer
(),
void
*
forceArgs
[]
=
{
&
ccmaAtoms
->
getDevicePointer
(),
&
ccmaDistance
->
getDevicePointer
(),
constrainVelocities
?
&
context
.
getVelm
().
getDevicePointer
()
:
&
posDelta
->
getDevicePointer
(),
constrainVelocities
?
&
context
.
getVelm
().
getDevicePointer
()
:
&
posDelta
->
getDevicePointer
(),
&
ccmaReducedMass
->
getDevicePointer
(),
&
ccmaDelta1
->
getDevicePointer
(),
&
ccmaConvergedDevice
Memory
,
&
ccmaReducedMass
->
getDevicePointer
(),
&
ccmaDelta1
->
getDevicePointer
(),
&
ccmaConverged
->
get
Device
Pointer
()
,
tolPointer
,
&
i
};
tolPointer
,
&
i
};
void
*
multiplyArgs
[]
=
{
&
ccmaDelta1
->
getDevicePointer
(),
&
ccmaDelta2
->
getDevicePointer
(),
void
*
multiplyArgs
[]
=
{
&
ccmaDelta1
->
getDevicePointer
(),
&
ccmaDelta2
->
getDevicePointer
(),
&
ccmaConstraintMatrixColumn
->
getDevicePointer
(),
&
ccmaConstraintMatrixValue
->
getDevicePointer
(),
&
ccmaConvergedDevice
Memory
,
&
i
};
&
ccmaConstraintMatrixColumn
->
getDevicePointer
(),
&
ccmaConstraintMatrixValue
->
getDevicePointer
(),
&
ccmaConverged
->
get
Device
Pointer
()
,
&
i
};
void
*
updateArgs
[]
=
{
&
ccmaNumAtomConstraints
->
getDevicePointer
(),
&
ccmaAtomConstraints
->
getDevicePointer
(),
&
ccmaDistance
->
getDevicePointer
(),
void
*
updateArgs
[]
=
{
&
ccmaNumAtomConstraints
->
getDevicePointer
(),
&
ccmaAtomConstraints
->
getDevicePointer
(),
&
ccmaDistance
->
getDevicePointer
(),
constrainVelocities
?
&
context
.
getVelm
().
getDevicePointer
()
:
&
posDelta
->
getDevicePointer
(),
constrainVelocities
?
&
context
.
getVelm
().
getDevicePointer
()
:
&
posDelta
->
getDevicePointer
(),
&
context
.
getVelm
().
getDevicePointer
(),
&
ccmaDelta1
->
getDevicePointer
(),
&
ccmaDelta2
->
getDevicePointer
(),
&
context
.
getVelm
().
getDevicePointer
(),
&
ccmaDelta1
->
getDevicePointer
(),
&
ccmaDelta2
->
getDevicePointer
(),
&
ccmaConvergedDevice
Memory
,
&
i
};
&
ccmaConverged
->
get
Device
Pointer
()
,
&
i
};
const
int
checkInterval
=
4
;
const
int
checkInterval
=
4
;
int
*
converged
=
(
int
*
)
context
.
getPinnedBuffer
();
for
(
i
=
0
;
i
<
150
;
i
++
)
{
for
(
i
=
0
;
i
<
150
;
i
++
)
{
if
(
i
==
0
)
{
ccmaConvergedMemory
[
0
]
=
1
;
ccmaConvergedMemory
[
1
]
=
0
;
}
context
.
executeKernel
(
ccmaForceKernel
,
forceArgs
,
ccmaAtoms
->
getSize
());
context
.
executeKernel
(
ccmaForceKernel
,
forceArgs
,
ccmaAtoms
->
getSize
());
if
((
i
+
1
)
%
checkInterval
==
0
)
if
((
i
+
1
)
%
checkInterval
==
0
)
{
ccmaConverged
->
download
(
converged
,
false
);
CHECK_RESULT2
(
cuEventRecord
(
ccmaEvent
,
0
),
"Error recording event for CCMA"
);
CHECK_RESULT2
(
cuEventRecord
(
ccmaEvent
,
0
),
"Error recording event for CCMA"
);
}
context
.
executeKernel
(
ccmaMultiplyKernel
,
multiplyArgs
,
ccmaAtoms
->
getSize
());
context
.
executeKernel
(
ccmaMultiplyKernel
,
multiplyArgs
,
ccmaAtoms
->
getSize
());
context
.
executeKernel
(
ccmaUpdateKernel
,
updateArgs
,
context
.
getNumAtoms
());
context
.
executeKernel
(
ccmaUpdateKernel
,
updateArgs
,
context
.
getNumAtoms
());
if
((
i
+
1
)
%
checkInterval
==
0
)
{
if
((
i
+
1
)
%
checkInterval
==
0
)
{
CHECK_RESULT2
(
cuEventSynchronize
(
ccmaEvent
),
"Error synchronizing on event for CCMA"
);
CHECK_RESULT2
(
cuEventSynchronize
(
ccmaEvent
),
"Error synchronizing on event for CCMA"
);
if
(
c
cmaC
onverged
Memory
[
i
%
2
])
if
(
converged
[
i
%
2
])
break
;
break
;
}
}
}
}
...
...
platforms/cuda/src/CudaIntegrationUtilities.h
View file @
93c467b2
...
@@ -9,7 +9,7 @@
...
@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* *
* Portions copyright (c) 2009-201
2
Stanford University and the Authors. *
* Portions copyright (c) 2009-201
3
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Authors: Peter Eastman *
* Contributors: *
* Contributors: *
* *
* *
...
@@ -140,8 +140,7 @@ private:
...
@@ -140,8 +140,7 @@ private:
CudaArray
*
ccmaConstraintMatrixValue
;
CudaArray
*
ccmaConstraintMatrixValue
;
CudaArray
*
ccmaDelta1
;
CudaArray
*
ccmaDelta1
;
CudaArray
*
ccmaDelta2
;
CudaArray
*
ccmaDelta2
;
int
*
ccmaConvergedMemory
;
CudaArray
*
ccmaConverged
;
CUdeviceptr
ccmaConvergedDeviceMemory
;
CUevent
ccmaEvent
;
CUevent
ccmaEvent
;
CudaArray
*
vsite2AvgAtoms
;
CudaArray
*
vsite2AvgAtoms
;
CudaArray
*
vsite2AvgWeights
;
CudaArray
*
vsite2AvgWeights
;
...
...
platforms/cuda/src/CudaKernels.cpp
View file @
93c467b2
This diff is collapsed.
Click to expand it.
platforms/cuda/src/CudaKernels.h
View file @
93c467b2
...
@@ -557,8 +557,7 @@ class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
...
@@ -557,8 +557,7 @@ class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
public:
public:
CudaCalcNonbondedForceKernel
(
std
::
string
name
,
const
Platform
&
platform
,
CudaContext
&
cu
,
System
&
system
)
:
CalcNonbondedForceKernel
(
name
,
platform
),
CudaCalcNonbondedForceKernel
(
std
::
string
name
,
const
Platform
&
platform
,
CudaContext
&
cu
,
System
&
system
)
:
CalcNonbondedForceKernel
(
name
,
platform
),
cu
(
cu
),
hasInitializedFFT
(
false
),
sigmaEpsilon
(
NULL
),
exceptionParams
(
NULL
),
cosSinSums
(
NULL
),
directPmeGrid
(
NULL
),
reciprocalPmeGrid
(
NULL
),
cu
(
cu
),
hasInitializedFFT
(
false
),
sigmaEpsilon
(
NULL
),
exceptionParams
(
NULL
),
cosSinSums
(
NULL
),
directPmeGrid
(
NULL
),
reciprocalPmeGrid
(
NULL
),
pmeBsplineModuliX
(
NULL
),
pmeBsplineModuliY
(
NULL
),
pmeBsplineModuliZ
(
NULL
),
pmeBsplineTheta
(
NULL
),
pmeBsplineDTheta
(
NULL
),
pmeBsplineModuliX
(
NULL
),
pmeBsplineModuliY
(
NULL
),
pmeBsplineModuliZ
(
NULL
),
pmeAtomRange
(
NULL
),
pmeAtomGridIndex
(
NULL
),
sort
(
NULL
)
{
pmeAtomRange
(
NULL
),
pmeAtomGridIndex
(
NULL
),
sort
(
NULL
)
{
}
}
~
CudaCalcNonbondedForceKernel
();
~
CudaCalcNonbondedForceKernel
();
/**
/**
...
@@ -607,8 +606,6 @@ private:
...
@@ -607,8 +606,6 @@ private:
CudaArray
*
pmeBsplineModuliX
;
CudaArray
*
pmeBsplineModuliX
;
CudaArray
*
pmeBsplineModuliY
;
CudaArray
*
pmeBsplineModuliY
;
CudaArray
*
pmeBsplineModuliZ
;
CudaArray
*
pmeBsplineModuliZ
;
CudaArray
*
pmeBsplineTheta
;
CudaArray
*
pmeBsplineDTheta
;
CudaArray
*
pmeAtomRange
;
CudaArray
*
pmeAtomRange
;
CudaArray
*
pmeAtomGridIndex
;
CudaArray
*
pmeAtomGridIndex
;
CudaSort
*
sort
;
CudaSort
*
sort
;
...
@@ -617,9 +614,6 @@ private:
...
@@ -617,9 +614,6 @@ private:
CUfunction
ewaldSumsKernel
;
CUfunction
ewaldSumsKernel
;
CUfunction
ewaldForcesKernel
;
CUfunction
ewaldForcesKernel
;
CUfunction
pmeGridIndexKernel
;
CUfunction
pmeGridIndexKernel
;
CUfunction
pmeAtomRangeKernel
;
CUfunction
pmeZIndexKernel
;
CUfunction
pmeUpdateBsplinesKernel
;
CUfunction
pmeSpreadChargeKernel
;
CUfunction
pmeSpreadChargeKernel
;
CUfunction
pmeFinishSpreadChargeKernel
;
CUfunction
pmeFinishSpreadChargeKernel
;
CUfunction
pmeEvalEnergyKernel
;
CUfunction
pmeEvalEnergyKernel
;
...
@@ -776,6 +770,8 @@ private:
...
@@ -776,6 +770,8 @@ private:
System
&
system
;
System
&
system
;
CUfunction
pairValueKernel
,
perParticleValueKernel
,
pairEnergyKernel
,
perParticleEnergyKernel
,
gradientChainRuleKernel
;
CUfunction
pairValueKernel
,
perParticleValueKernel
,
pairEnergyKernel
,
perParticleEnergyKernel
,
gradientChainRuleKernel
;
std
::
vector
<
void
*>
pairValueArgs
,
perParticleValueArgs
,
pairEnergyArgs
,
perParticleEnergyArgs
,
gradientChainRuleArgs
;
std
::
vector
<
void
*>
pairValueArgs
,
perParticleValueArgs
,
pairEnergyArgs
,
perParticleEnergyArgs
,
gradientChainRuleArgs
;
std
::
string
pairValueSrc
,
pairEnergySrc
;
std
::
map
<
std
::
string
,
std
::
string
>
pairValueDefines
,
pairEnergyDefines
;
};
};
/**
/**
...
...
platforms/cuda/src/CudaNonbondedUtilities.cpp
View file @
93c467b2
This diff is collapsed.
Click to expand it.
platforms/cuda/src/CudaNonbondedUtilities.h
View file @
93c467b2
...
@@ -9,7 +9,7 @@
...
@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* *
* Portions copyright (c) 2009-201
2
Stanford University and the Authors. *
* Portions copyright (c) 2009-201
3
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Authors: Peter Eastman *
* Contributors: *
* Contributors: *
* *
* *
...
@@ -36,6 +36,8 @@
...
@@ -36,6 +36,8 @@
namespace
OpenMM
{
namespace
OpenMM
{
class
CudaSort
;
/**
/**
* This class provides a generic interface for calculating nonbonded interactions. It does this in two
* This class provides a generic interface for calculating nonbonded interactions. It does this in two
* ways. First, it can be used to create kernels that evaluate nonbonded interactions. Clients
* ways. First, it can be used to create kernels that evaluate nonbonded interactions. Clients
...
@@ -181,10 +183,10 @@ public:
...
@@ -181,10 +183,10 @@ public:
return
*
interactingTiles
;
return
*
interactingTiles
;
}
}
/**
/**
* Get the array containing
flags for
tile
s
with interactions.
* Get the array containing
the atoms in each
tile with interactions.
*/
*/
CudaArray
&
getInteracti
onFlag
s
()
{
CudaArray
&
getInteracti
ngAtom
s
()
{
return
*
interacti
onFlag
s
;
return
*
interacti
ngAtom
s
;
}
}
/**
/**
* Get the array containing exclusion flags.
* Get the array containing exclusion flags.
...
@@ -192,6 +194,12 @@ public:
...
@@ -192,6 +194,12 @@ public:
CudaArray
&
getExclusions
()
{
CudaArray
&
getExclusions
()
{
return
*
exclusions
;
return
*
exclusions
;
}
}
/**
* Get the array containing tiles with exclusions.
*/
CudaArray
&
getExclusionTiles
()
{
return
*
exclusionTiles
;
}
/**
/**
* Get the array containing the index into the exclusion array for each tile.
* Get the array containing the index into the exclusion array for each tile.
*/
*/
...
@@ -217,9 +225,17 @@ public:
...
@@ -217,9 +225,17 @@ public:
return
numTiles
;
return
numTiles
;
}
}
/**
/**
* Set the range of tiles that should be processed by this context.
* Set whether to add padding to the cutoff distance when building the neighbor list.
* This increases the size of the neighbor list (and thus the cost of computing interactions),
* but also means we don't need to rebuild it every time step. The default value is true,
* since usually this improves performance. For very expensive interactions, however,
* it may be better to set this to false.
*/
void
setUsePadding
(
bool
padding
);
/**
* Set the range of atom blocks and tiles that should be processed by this context.
*/
*/
void
set
TileRange
(
int
startTileIndex
,
int
numTiles
);
void
set
AtomBlockRange
(
double
startFraction
,
double
endFraction
);
/**
/**
* Create a Kernel for evaluating a nonbonded interaction. Cutoffs and periodic boundary conditions
* Create a Kernel for evaluating a nonbonded interaction. Cutoffs and periodic boundary conditions
* are assumed to be the same as those for the default interaction Kernel, since this kernel will use
* are assumed to be the same as those for the default interaction Kernel, since this kernel will use
...
@@ -232,42 +248,38 @@ public:
...
@@ -232,42 +248,38 @@ public:
* @param isSymmetric specifies whether the interaction is symmetric
* @param isSymmetric specifies whether the interaction is symmetric
*/
*/
CUfunction
createInteractionKernel
(
const
std
::
string
&
source
,
std
::
vector
<
ParameterInfo
>&
params
,
std
::
vector
<
ParameterInfo
>&
arguments
,
bool
useExclusions
,
bool
isSymmetric
);
CUfunction
createInteractionKernel
(
const
std
::
string
&
source
,
std
::
vector
<
ParameterInfo
>&
params
,
std
::
vector
<
ParameterInfo
>&
arguments
,
bool
useExclusions
,
bool
isSymmetric
);
/**
* This is a utility routine for locating data in the exclusions array. It takes the (x,y) indices of a tile,
* and returns the location in the array where the data for that tile begins.
*
* This routine requires that x >= y. If not, it will throw an exception.
*
* @param x the x index of the tile
* @param y the y index of the tile
* @param exclusionIndices the content of the exclusionIndices array
* @param exclusionRowIndices the content of the exclusionRowIndices array
* @return the index in the exclusions array at which the data for that tile begins
*/
static
int
findExclusionIndex
(
int
x
,
int
y
,
const
std
::
vector
<
unsigned
int
>&
exclusionIndices
,
const
std
::
vector
<
unsigned
int
>&
exclusionRowIndices
);
private:
private:
class
BlockSortTrait
;
CudaContext
&
context
;
CudaContext
&
context
;
CUfunction
forceKernel
;
CUfunction
forceKernel
;
CUfunction
findBlockBoundsKernel
;
CUfunction
findBlockBoundsKernel
;
CUfunction
sortBoxDataKernel
;
CUfunction
findInteractingBlocksKernel
;
CUfunction
findInteractingBlocksKernel
;
CUfunction
findInteractionsWithinBlocksKernel
;
CUfunction
findInteractionsWithinBlocksKernel
;
CudaArray
*
exclusionTiles
;
CudaArray
*
exclusions
;
CudaArray
*
exclusions
;
CudaArray
*
exclusionIndices
;
CudaArray
*
exclusionIndices
;
CudaArray
*
exclusionRowIndices
;
CudaArray
*
exclusionRowIndices
;
CudaArray
*
interactingTiles
;
CudaArray
*
interactingTiles
;
CudaArray
*
interacti
onFlag
s
;
CudaArray
*
interacti
ngAtom
s
;
CudaArray
*
interactionCount
;
CudaArray
*
interactionCount
;
CudaArray
*
blockCenter
;
CudaArray
*
blockCenter
;
CudaArray
*
blockBoundingBox
;
CudaArray
*
blockBoundingBox
;
std
::
vector
<
void
*>
forceArgs
,
findBlockBoundsArgs
,
findInteractingBlocksArgs
,
findInteractionsWithinBlocksArgs
;
CudaArray
*
sortedBlocks
;
CudaArray
*
sortedBlockCenter
;
CudaArray
*
sortedBlockBoundingBox
;
CudaArray
*
oldPositions
;
CudaArray
*
rebuildNeighborList
;
CudaSort
*
blockSorter
;
std
::
vector
<
void
*>
forceArgs
,
findBlockBoundsArgs
,
sortBoxDataArgs
,
findInteractingBlocksArgs
;
std
::
vector
<
std
::
vector
<
int
>
>
atomExclusions
;
std
::
vector
<
std
::
vector
<
int
>
>
atomExclusions
;
std
::
vector
<
ParameterInfo
>
parameters
;
std
::
vector
<
ParameterInfo
>
parameters
;
std
::
vector
<
ParameterInfo
>
arguments
;
std
::
vector
<
ParameterInfo
>
arguments
;
std
::
string
kernelSource
;
std
::
string
kernelSource
;
std
::
map
<
std
::
string
,
std
::
string
>
kernelDefines
;
std
::
map
<
std
::
string
,
std
::
string
>
kernelDefines
;
double
cutoff
;
double
cutoff
;
bool
useCutoff
,
usePeriodic
,
anyExclusions
;
bool
useCutoff
,
usePeriodic
,
anyExclusions
,
usePadding
;
int
startTileIndex
,
numTiles
,
maxTiles
,
numForceThreadBlocks
,
forceThreadBlockSize
,
nonbondedForceGroup
,
numAtoms
;
int
startTileIndex
,
numTiles
,
startBlockIndex
,
numBlocks
,
maxTiles
,
numForceThreadBlocks
,
forceThreadBlockSize
,
nonbondedForceGroup
,
numAtoms
;
};
};
/**
/**
...
...
platforms/cuda/src/CudaParallelKernels.cpp
View file @
93c467b2
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* *
* Portions copyright (c) 2011-201
2
Stanford University and the Authors. *
* Portions copyright (c) 2011-201
3
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Authors: Peter Eastman *
* Contributors: *
* Contributors: *
* *
* *
...
@@ -118,7 +118,7 @@ private:
...
@@ -118,7 +118,7 @@ private:
};
};
CudaParallelCalcForcesAndEnergyKernel
::
CudaParallelCalcForcesAndEnergyKernel
(
string
name
,
const
Platform
&
platform
,
CudaPlatform
::
PlatformData
&
data
)
:
CudaParallelCalcForcesAndEnergyKernel
::
CudaParallelCalcForcesAndEnergyKernel
(
string
name
,
const
Platform
&
platform
,
CudaPlatform
::
PlatformData
&
data
)
:
CalcForcesAndEnergyKernel
(
name
,
platform
),
data
(
data
),
completionTimes
(
data
.
contexts
.
size
()),
context
Tile
s
(
data
.
contexts
.
size
()),
contextForces
(
NULL
),
CalcForcesAndEnergyKernel
(
name
,
platform
),
data
(
data
),
completionTimes
(
data
.
contexts
.
size
()),
context
NonbondedFraction
s
(
data
.
contexts
.
size
()),
contextForces
(
NULL
),
pinnedPositionBuffer
(
NULL
),
pinnedForceBuffer
(
NULL
)
{
pinnedPositionBuffer
(
NULL
),
pinnedForceBuffer
(
NULL
)
{
for
(
int
i
=
0
;
i
<
(
int
)
data
.
contexts
.
size
();
i
++
)
for
(
int
i
=
0
;
i
<
(
int
)
data
.
contexts
.
size
();
i
++
)
kernels
.
push_back
(
Kernel
(
new
CudaCalcForcesAndEnergyKernel
(
name
,
platform
,
*
data
.
contexts
[
i
])));
kernels
.
push_back
(
Kernel
(
new
CudaCalcForcesAndEnergyKernel
(
name
,
platform
,
*
data
.
contexts
[
i
])));
...
@@ -141,6 +141,8 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
...
@@ -141,6 +141,8 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
sumKernel
=
cu
.
getKernel
(
module
,
"sumForces"
);
sumKernel
=
cu
.
getKernel
(
module
,
"sumForces"
);
for
(
int
i
=
0
;
i
<
(
int
)
kernels
.
size
();
i
++
)
for
(
int
i
=
0
;
i
<
(
int
)
kernels
.
size
();
i
++
)
getKernel
(
i
).
initialize
(
system
);
getKernel
(
i
).
initialize
(
system
);
for
(
int
i
=
0
;
i
<
(
int
)
contextNonbondedFractions
.
size
();
i
++
)
contextNonbondedFractions
[
i
]
=
1
/
(
double
)
contextNonbondedFractions
.
size
();
}
}
void
CudaParallelCalcForcesAndEnergyKernel
::
beginComputation
(
ContextImpl
&
context
,
bool
includeForce
,
bool
includeEnergy
,
int
groups
)
{
void
CudaParallelCalcForcesAndEnergyKernel
::
beginComputation
(
ContextImpl
&
context
,
bool
includeForce
,
bool
includeEnergy
,
int
groups
)
{
...
@@ -184,30 +186,26 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
...
@@ -184,30 +186,26 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
void
*
args
[]
=
{
&
cu
.
getForce
().
getDevicePointer
(),
&
contextForces
->
getDevicePointer
(),
&
bufferSize
,
&
numBuffers
};
void
*
args
[]
=
{
&
cu
.
getForce
().
getDevicePointer
(),
&
contextForces
->
getDevicePointer
(),
&
bufferSize
,
&
numBuffers
};
cu
.
executeKernel
(
sumKernel
,
args
,
bufferSize
);
cu
.
executeKernel
(
sumKernel
,
args
,
bufferSize
);
// Balance work between the contexts by transferring a
few
nonbonded
tiles
from the context that
// Balance work between the contexts by transferring a
little
nonbonded
work
from the context that
// finished last to the one that finished first.
// finished last to the one that finished first.
int
firstIndex
=
0
,
lastIndex
=
0
;
int
firstIndex
=
0
,
lastIndex
=
0
;
int
totalTiles
=
0
;
for
(
int
i
=
0
;
i
<
(
int
)
completionTimes
.
size
();
i
++
)
{
for
(
int
i
=
0
;
i
<
(
int
)
completionTimes
.
size
();
i
++
)
{
if
(
completionTimes
[
i
]
<
completionTimes
[
firstIndex
])
if
(
completionTimes
[
i
]
<
completionTimes
[
firstIndex
])
firstIndex
=
i
;
firstIndex
=
i
;
if
(
completionTimes
[
i
]
>
completionTimes
[
lastIndex
])
if
(
completionTimes
[
i
]
>
completionTimes
[
lastIndex
])
lastIndex
=
i
;
lastIndex
=
i
;
contextTiles
[
i
]
=
data
.
contexts
[
i
]
->
getNonbondedUtilities
().
getNumTiles
();
}
totalTiles
+=
contextTiles
[
i
];
double
fractionToTransfer
=
min
(
0.001
,
contextNonbondedFractions
[
lastIndex
]);
}
contextNonbondedFractions
[
firstIndex
]
+=
fractionToTransfer
;
int
tilesToTransfer
=
totalTiles
/
1000
;
contextNonbondedFractions
[
lastIndex
]
-=
fractionToTransfer
;
if
(
tilesToTransfer
<
1
)
double
startFraction
=
0.0
;
tilesToTransfer
=
1
;
for
(
int
i
=
0
;
i
<
(
int
)
contextNonbondedFractions
.
size
();
i
++
)
{
if
(
tilesToTransfer
>
contextTiles
[
lastIndex
])
double
endFraction
=
startFraction
+
contextNonbondedFractions
[
i
];
tilesToTransfer
=
contextTiles
[
lastIndex
];
if
(
i
==
contextNonbondedFractions
.
size
()
-
1
)
contextTiles
[
firstIndex
]
+=
tilesToTransfer
;
endFraction
=
1.0
;
// Avoid roundoff error
contextTiles
[
lastIndex
]
-=
tilesToTransfer
;
data
.
contexts
[
i
]
->
getNonbondedUtilities
().
setAtomBlockRange
(
startFraction
,
endFraction
);
int
startIndex
=
0
;
startFraction
=
endFraction
;
for
(
int
i
=
0
;
i
<
(
int
)
contextTiles
.
size
();
i
++
)
{
data
.
contexts
[
i
]
->
getNonbondedUtilities
().
setTileRange
(
startIndex
,
contextTiles
[
i
]);
startIndex
+=
contextTiles
[
i
];
}
}
}
}
return
energy
;
return
energy
;
...
...
platforms/cuda/src/CudaParallelKernels.h
View file @
93c467b2
...
@@ -9,7 +9,7 @@
...
@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* *
* Portions copyright (c) 2011-201
2
Stanford University and the Authors. *
* Portions copyright (c) 2011-201
3
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Authors: Peter Eastman *
* Contributors: *
* Contributors: *
* *
* *
...
@@ -80,7 +80,7 @@ private:
...
@@ -80,7 +80,7 @@ private:
CudaPlatform
::
PlatformData
&
data
;
CudaPlatform
::
PlatformData
&
data
;
std
::
vector
<
Kernel
>
kernels
;
std
::
vector
<
Kernel
>
kernels
;
std
::
vector
<
long
long
>
completionTimes
;
std
::
vector
<
long
long
>
completionTimes
;
std
::
vector
<
int
>
context
Tile
s
;
std
::
vector
<
double
>
context
NonbondedFraction
s
;
CudaArray
*
contextForces
;
CudaArray
*
contextForces
;
void
*
pinnedPositionBuffer
;
void
*
pinnedPositionBuffer
;
long
long
*
pinnedForceBuffer
;
long
long
*
pinnedForceBuffer
;
...
...
platforms/cuda/src/CudaSort.cpp
View file @
93c467b2
...
@@ -32,7 +32,7 @@ using namespace OpenMM;
...
@@ -32,7 +32,7 @@ using namespace OpenMM;
using
namespace
std
;
using
namespace
std
;
CudaSort
::
CudaSort
(
CudaContext
&
context
,
SortTrait
*
trait
,
unsigned
int
length
)
:
context
(
context
),
trait
(
trait
),
CudaSort
::
CudaSort
(
CudaContext
&
context
,
SortTrait
*
trait
,
unsigned
int
length
)
:
context
(
context
),
trait
(
trait
),
dataRange
(
NULL
),
bucketOfElement
(
NULL
),
offsetInBucket
(
NULL
),
bucketOffset
(
NULL
),
buckets
(
NULL
)
{
dataRange
(
NULL
),
bucketOfElement
(
NULL
),
offsetInBucket
(
NULL
),
bucketOffset
(
NULL
),
buckets
(
NULL
)
,
dataLength
(
length
)
{
// Create kernels.
// Create kernels.
map
<
string
,
string
>
replacements
;
map
<
string
,
string
>
replacements
;
...
@@ -43,6 +43,7 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
...
@@ -43,6 +43,7 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
replacements
[
"MAX_KEY"
]
=
trait
->
getMaxKey
();
replacements
[
"MAX_KEY"
]
=
trait
->
getMaxKey
();
replacements
[
"MAX_VALUE"
]
=
trait
->
getMaxValue
();
replacements
[
"MAX_VALUE"
]
=
trait
->
getMaxValue
();
CUmodule
module
=
context
.
createModule
(
context
.
replaceStrings
(
CudaKernelSources
::
sort
,
replacements
));
CUmodule
module
=
context
.
createModule
(
context
.
replaceStrings
(
CudaKernelSources
::
sort
,
replacements
));
shortListKernel
=
context
.
getKernel
(
module
,
"sortShortList"
);
computeRangeKernel
=
context
.
getKernel
(
module
,
"computeRange"
);
computeRangeKernel
=
context
.
getKernel
(
module
,
"computeRange"
);
assignElementsKernel
=
context
.
getKernel
(
module
,
"assignElementsToBuckets"
);
assignElementsKernel
=
context
.
getKernel
(
module
,
"assignElementsToBuckets"
);
computeBucketPositionsKernel
=
context
.
getKernel
(
module
,
"computeBucketPositions"
);
computeBucketPositionsKernel
=
context
.
getKernel
(
module
,
"computeBucketPositions"
);
...
@@ -53,15 +54,16 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
...
@@ -53,15 +54,16 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
int
maxBlockSize
;
int
maxBlockSize
;
cuDeviceGetAttribute
(
&
maxBlockSize
,
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X
,
context
.
getDevice
());
cuDeviceGetAttribute
(
&
maxBlockSize
,
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X
,
context
.
getDevice
());
int
maxSharedMem
;
cuDeviceGetAttribute
(
&
maxSharedMem
,
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK
,
context
.
getDevice
());
unsigned
int
maxLocalBuffer
=
(
unsigned
int
)
((
maxSharedMem
/
trait
->
getDataSize
())
/
2
);
isShortList
=
(
length
<=
maxLocalBuffer
);
for
(
rangeKernelSize
=
1
;
rangeKernelSize
*
2
<=
maxBlockSize
;
rangeKernelSize
*=
2
)
for
(
rangeKernelSize
=
1
;
rangeKernelSize
*
2
<=
maxBlockSize
;
rangeKernelSize
*=
2
)
;
;
positionsKernelSize
=
rangeKernelSize
;
positionsKernelSize
=
rangeKernelSize
;
sortKernelSize
=
rangeKernelSize
/
2
;
sortKernelSize
=
(
isShortList
?
rangeKernelSize
/
2
:
rangeKernelSize
/
4
)
;
if
(
rangeKernelSize
>
length
)
if
(
rangeKernelSize
>
length
)
rangeKernelSize
=
length
;
rangeKernelSize
=
length
;
int
maxSharedMem
;
cuDeviceGetAttribute
(
&
maxSharedMem
,
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK
,
context
.
getDevice
());
unsigned
int
maxLocalBuffer
=
(
unsigned
int
)
((
maxSharedMem
/
trait
->
getDataSize
())
/
2
);
if
(
sortKernelSize
>
maxLocalBuffer
)
if
(
sortKernelSize
>
maxLocalBuffer
)
sortKernelSize
=
maxLocalBuffer
;
sortKernelSize
=
maxLocalBuffer
;
unsigned
int
targetBucketSize
=
sortKernelSize
/
2
;
unsigned
int
targetBucketSize
=
sortKernelSize
/
2
;
...
@@ -73,11 +75,13 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
...
@@ -73,11 +75,13 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
// Create workspace arrays.
// Create workspace arrays.
if
(
!
isShortList
)
{
dataRange
=
new
CudaArray
(
context
,
2
,
trait
->
getKeySize
(),
"sortDataRange"
);
dataRange
=
new
CudaArray
(
context
,
2
,
trait
->
getKeySize
(),
"sortDataRange"
);
bucketOffset
=
CudaArray
::
create
<
uint1
>
(
context
,
numBuckets
,
"bucketOffset"
);
bucketOffset
=
CudaArray
::
create
<
uint1
>
(
context
,
numBuckets
,
"bucketOffset"
);
bucketOfElement
=
CudaArray
::
create
<
uint1
>
(
context
,
length
,
"bucketOfElement"
);
bucketOfElement
=
CudaArray
::
create
<
uint1
>
(
context
,
length
,
"bucketOfElement"
);
offsetInBucket
=
CudaArray
::
create
<
uint1
>
(
context
,
length
,
"offsetInBucket"
);
offsetInBucket
=
CudaArray
::
create
<
uint1
>
(
context
,
length
,
"offsetInBucket"
);
buckets
=
new
CudaArray
(
context
,
length
,
trait
->
getDataSize
(),
"buckets"
);
buckets
=
new
CudaArray
(
context
,
length
,
trait
->
getDataSize
(),
"buckets"
);
}
}
}
CudaSort
::~
CudaSort
()
{
CudaSort
::~
CudaSort
()
{
...
@@ -95,22 +99,27 @@ CudaSort::~CudaSort() {
...
@@ -95,22 +99,27 @@ CudaSort::~CudaSort() {
}
}
void
CudaSort
::
sort
(
CudaArray
&
data
)
{
void
CudaSort
::
sort
(
CudaArray
&
data
)
{
if
(
data
.
getSize
()
!=
bucketOfElement
->
getSize
()
||
data
.
getElementSize
()
!=
trait
->
getDataSize
())
if
(
data
.
getSize
()
!=
dataLength
||
data
.
getElementSize
()
!=
trait
->
getDataSize
())
throw
OpenMMException
(
"CudaSort called with different data size"
);
throw
OpenMMException
(
"CudaSort called with different data size"
);
if
(
data
.
getSize
()
==
0
)
if
(
data
.
getSize
()
==
0
)
return
;
return
;
if
(
isShortList
)
{
// We can use a simpler sort kernel that does the entire operation at once in local memory.
void
*
sortArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
dataLength
};
context
.
executeKernel
(
shortListKernel
,
sortArgs
,
sortKernelSize
,
sortKernelSize
,
dataLength
*
trait
->
getDataSize
());
}
else
{
// Compute the range of data values.
// Compute the range of data values.
unsigned
int
dataSize
=
data
.
getSize
();
void
*
rangeArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
dataLength
,
&
dataRange
->
getDevicePointer
()};
void
*
rangeArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
dataSize
,
&
dataRange
->
getDevicePointer
()};
context
.
executeKernel
(
computeRangeKernel
,
rangeArgs
,
rangeKernelSize
,
rangeKernelSize
,
rangeKernelSize
*
trait
->
getKeySize
());
context
.
executeKernel
(
computeRangeKernel
,
rangeArgs
,
rangeKernelSize
,
rangeKernelSize
,
rangeKernelSize
*
trait
->
getKeySize
());
// Assign array elements to buckets.
// Assign array elements to buckets.
unsigned
int
numBuckets
=
bucketOffset
->
getSize
();
unsigned
int
numBuckets
=
bucketOffset
->
getSize
();
context
.
clearBuffer
(
*
bucketOffset
);
context
.
clearBuffer
(
*
bucketOffset
);
void
*
elementsArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
data
Size
,
&
numBuckets
,
&
dataRange
->
getDevicePointer
(),
void
*
elementsArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
data
Length
,
&
numBuckets
,
&
dataRange
->
getDevicePointer
(),
&
bucketOffset
->
getDevicePointer
(),
&
bucketOfElement
->
getDevicePointer
(),
&
offsetInBucket
->
getDevicePointer
()};
&
bucketOffset
->
getDevicePointer
(),
&
bucketOfElement
->
getDevicePointer
(),
&
offsetInBucket
->
getDevicePointer
()};
context
.
executeKernel
(
assignElementsKernel
,
elementsArgs
,
data
.
getSize
());
context
.
executeKernel
(
assignElementsKernel
,
elementsArgs
,
data
.
getSize
());
...
@@ -121,7 +130,7 @@ void CudaSort::sort(CudaArray& data) {
...
@@ -121,7 +130,7 @@ void CudaSort::sort(CudaArray& data) {
// Copy the data into the buckets.
// Copy the data into the buckets.
void
*
copyArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
buckets
->
getDevicePointer
(),
&
data
Size
,
&
bucketOffset
->
getDevicePointer
(),
void
*
copyArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
buckets
->
getDevicePointer
(),
&
data
Length
,
&
bucketOffset
->
getDevicePointer
(),
&
bucketOfElement
->
getDevicePointer
(),
&
offsetInBucket
->
getDevicePointer
()};
&
bucketOfElement
->
getDevicePointer
(),
&
offsetInBucket
->
getDevicePointer
()};
context
.
executeKernel
(
copyToBucketsKernel
,
copyArgs
,
data
.
getSize
());
context
.
executeKernel
(
copyToBucketsKernel
,
copyArgs
,
data
.
getSize
());
...
@@ -129,4 +138,5 @@ void CudaSort::sort(CudaArray& data) {
...
@@ -129,4 +138,5 @@ void CudaSort::sort(CudaArray& data) {
void
*
sortArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
buckets
->
getDevicePointer
(),
&
numBuckets
,
&
bucketOffset
->
getDevicePointer
()};
void
*
sortArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
buckets
->
getDevicePointer
(),
&
numBuckets
,
&
bucketOffset
->
getDevicePointer
()};
context
.
executeKernel
(
sortBucketsKernel
,
sortArgs
,
((
data
.
getSize
()
+
sortKernelSize
-
1
)
/
sortKernelSize
)
*
sortKernelSize
,
sortKernelSize
,
sortKernelSize
*
trait
->
getDataSize
());
context
.
executeKernel
(
sortBucketsKernel
,
sortArgs
,
((
data
.
getSize
()
+
sortKernelSize
-
1
)
/
sortKernelSize
)
*
sortKernelSize
,
sortKernelSize
,
sortKernelSize
*
trait
->
getDataSize
());
}
}
}
platforms/cuda/src/CudaSort.h
View file @
93c467b2
...
@@ -92,8 +92,9 @@ private:
...
@@ -92,8 +92,9 @@ private:
CudaArray
*
offsetInBucket
;
CudaArray
*
offsetInBucket
;
CudaArray
*
bucketOffset
;
CudaArray
*
bucketOffset
;
CudaArray
*
buckets
;
CudaArray
*
buckets
;
CUfunction
computeRangeKernel
,
assignElementsKernel
,
computeBucketPositionsKernel
,
copyToBucketsKernel
,
sortBucketsKernel
;
CUfunction
shortListKernel
,
computeRangeKernel
,
assignElementsKernel
,
computeBucketPositionsKernel
,
copyToBucketsKernel
,
sortBucketsKernel
;
unsigned
int
rangeKernelSize
,
positionsKernelSize
,
sortKernelSize
;
unsigned
int
dataLength
,
rangeKernelSize
,
positionsKernelSize
,
sortKernelSize
;
bool
isShortList
;
};
};
/**
/**
...
...
platforms/cuda/src/kernels/coulombLennardJones.cu
View file @
93c467b2
#if USE_EWALD
#if USE_EWALD
bool
needCorrection
=
isExcluded
&&
atom1
!=
atom2
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
;
bool
needCorrection
=
hasExclusions
&&
isExcluded
&&
atom1
!=
atom2
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
;
if
(
!
isExcluded
||
needCorrection
)
{
if
(
!
isExcluded
||
needCorrection
)
{
real
tempForce
=
0.0
f
;
if
(
r2
<
CUTOFF_SQUARED
||
needCorrection
)
{
if
(
r2
<
CUTOFF_SQUARED
||
needCorrection
)
{
const
real
alphaR
=
EWALD_ALPHA
*
r
;
const
real
alphaR
=
EWALD_ALPHA
*
r
;
const
real
expAlphaRSqr
=
EXP
(
-
alphaR
*
alphaR
);
const
real
expAlphaRSqr
=
EXP
(
-
alphaR
*
alphaR
);
...
@@ -16,6 +15,7 @@ if (!isExcluded || needCorrection) {
...
@@ -16,6 +15,7 @@ if (!isExcluded || needCorrection) {
t
*=
t
;
t
*=
t
;
t
*=
t
;
t
*=
t
;
const
real
erfcAlphaR
=
RECIP
(
t
*
t
);
const
real
erfcAlphaR
=
RECIP
(
t
*
t
);
real
tempForce
=
0.0
f
;
if
(
needCorrection
)
{
if
(
needCorrection
)
{
// Subtract off the part of this interaction that was included in the reciprocal space contribution.
// Subtract off the part of this interaction that was included in the reciprocal space contribution.
...
@@ -36,8 +36,8 @@ if (!isExcluded || needCorrection) {
...
@@ -36,8 +36,8 @@ if (!isExcluded || needCorrection) {
tempEnergy
+=
prefactor
*
erfcAlphaR
;
tempEnergy
+=
prefactor
*
erfcAlphaR
;
#endif
#endif
}
}
}
dEdR
+=
tempForce
*
invR
*
invR
;
dEdR
+=
tempForce
*
invR
*
invR
;
}
}
}
#else
#else
{
{
...
...
platforms/cuda/src/kernels/customGBEnergyN2.cu
View file @
93c467b2
#define STORE_DERIVATIVE_1(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (deriv##INDEX##_1*0x100000000)));
#define STORE_DERIVATIVE_1(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (deriv##INDEX##_1*0x100000000)));
#define STORE_DERIVATIVE_2(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].deriv##INDEX*0x100000000)));
#define STORE_DERIVATIVE_2(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].deriv##INDEX*0x100000000)));
#define TILE_SIZE 32
typedef
struct
{
typedef
struct
{
real4
posq
;
real4
posq
;
...
@@ -15,88 +14,43 @@ typedef struct {
...
@@ -15,88 +14,43 @@ typedef struct {
* Compute a force based on pair interactions.
* Compute a force based on pair interactions.
*/
*/
extern
"C"
__global__
void
computeN2Energy
(
unsigned
long
long
*
__restrict__
forceBuffers
,
real
*
__restrict__
energyBuffer
,
extern
"C"
__global__
void
computeN2Energy
(
unsigned
long
long
*
__restrict__
forceBuffers
,
real
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
unsigned
int
*
__restrict__
exclusions
,
const
unsigned
int
*
__restrict__
exclusionIndices
,
const
real4
*
__restrict__
posq
,
const
unsigned
int
*
__restrict__
exclusions
,
const
ushort2
*
__restrict__
exclusionTiles
,
const
unsigned
int
*
__restrict__
exclusionRowIndices
,
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
unsigned
int
*
__restrict__
interacti
onFlag
s
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
const
unsigned
int
*
__restrict__
interacti
ngAtom
s
#else
#else
unsigned
int
numTiles
unsigned
int
numTiles
#endif
#endif
PARAMETER_ARGUMENTS
)
{
PARAMETER_ARGUMENTS
)
{
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
const
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
#ifdef USE_CUTOFF
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
unsigned
int
numTiles
=
interactionCount
[
0
];
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
unsigned
int
pos
=
warp
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
unsigned
int
end
=
(
warp
+
1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
#else
unsigned
int
pos
=
warp
*
numTiles
/
totalWarps
;
unsigned
int
end
=
(
warp
+
1
)
*
numTiles
/
totalWarps
;
#endif
real
energy
=
0
;
real
energy
=
0
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
__shared__
unsigned
int
exclusionRange
[
2
*
WARPS_PER_GROUP
];
__shared__
int
exclusionIndex
[
WARPS_PER_GROUP
];
do
{
// First loop: process tiles that contain exclusions.
// Extract the coordinates of this tile
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE
+
warp
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE
+
(
warp
+
1
)
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
const
unsigned
int
localGroupIndex
=
threadIdx
.
x
/
TILE_SIZE
;
for
(
int
pos
=
firstExclusionTile
;
pos
<
lastExclusionTile
;
pos
++
)
{
unsigned
int
x
,
y
;
const
ushort2
tileIndices
=
exclusionTiles
[
pos
];
const
unsigned
int
x
=
tileIndices
.
x
;
const
unsigned
int
y
=
tileIndices
.
y
;
real3
force
=
make_real3
(
0
);
real3
force
=
make_real3
(
0
);
DECLARE_ATOM1_DERIVATIVES
DECLARE_ATOM1_DERIVATIVES
if
(
pos
<
end
)
{
#ifdef USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles
[
pos
];
x
=
tileIndices
.
x
;
y
=
tileIndices
.
y
;
}
else
#endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}
}
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real4
posq1
=
posq
[
atom1
];
real4
posq1
=
posq
[
atom1
];
LOAD_ATOM1_PARAMETERS
LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS
#ifdef USE_EXCLUSIONS
if
(
tgx
<
2
)
unsigned
int
excl
=
exclusions
[
pos
*
TILE_SIZE
+
tgx
];
exclusionRange
[
2
*
localGroupIndex
+
tgx
]
=
exclusionRowIndices
[
x
+
tgx
];
if
(
tgx
==
0
)
exclusionIndex
[
localGroupIndex
]
=
-
1
;
for
(
unsigned
int
i
=
exclusionRange
[
2
*
localGroupIndex
]
+
tgx
;
i
<
exclusionRange
[
2
*
localGroupIndex
+
1
];
i
+=
TILE_SIZE
)
if
(
exclusionIndices
[
i
]
==
y
)
exclusionIndex
[
localGroupIndex
]
=
i
*
TILE_SIZE
;
bool
hasExclusions
=
(
exclusionIndex
[
localGroupIndex
]
>
-
1
);
#else
bool
hasExclusions
=
false
;
#endif
#endif
if
(
pos
>=
end
)
if
(
x
==
y
)
{
;
// This warp is done.
else
if
(
x
==
y
)
{
// This tile is on the diagonal.
// This tile is on the diagonal.
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
localData
[
localAtomIndex
].
posq
=
posq1
;
localData
[
localAtomIndex
].
posq
=
posq1
;
LOAD_LOCAL_PARAMETERS_FROM_1
LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS
unsigned
int
excl
=
exclusions
[
exclusionIndex
[
localGroupIndex
]
+
tgx
];
#endif
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
#ifdef USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
);
#endif
int
atom2
=
tbx
+
j
;
int
atom2
=
tbx
+
j
;
real4
posq2
=
localData
[
atom2
].
posq
;
real4
posq2
=
localData
[
atom2
].
posq
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
...
@@ -115,6 +69,9 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
...
@@ -115,6 +69,9 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
atom2
=
y
*
TILE_SIZE
+
j
;
atom2
=
y
*
TILE_SIZE
+
j
;
real
dEdR
=
0
;
real
dEdR
=
0
;
real
tempEnergy
=
0
;
real
tempEnergy
=
0
;
#ifdef USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
);
#endif
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
atom1
!=
atom2
)
{
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
atom1
!=
atom2
)
{
COMPUTE_INTERACTION
COMPUTE_INTERACTION
dEdR
/=
-
r
;
dEdR
/=
-
r
;
...
@@ -136,32 +93,16 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
...
@@ -136,32 +93,16 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
// This is an off-diagonal tile.
// This is an off-diagonal tile.
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
if
(
lasty
!=
y
)
{
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
localData
[
localAtomIndex
].
posq
=
posq
[
j
];
localData
[
localAtomIndex
].
posq
=
posq
[
j
];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
localData
[
localAtomIndex
].
force
=
make_real3
(
0
);
localData
[
localAtomIndex
].
force
=
make_real3
(
0
);
CLEAR_LOCAL_DERIVATIVES
CLEAR_LOCAL_DERIVATIVES
#ifdef USE_CUTOFF
unsigned
int
flags
=
(
numTiles
<=
maxTiles
?
interactionFlags
[
pos
]
:
0xFFFFFFFF
);
if
(
!
hasExclusions
&&
flags
==
0
)
{
// No interactions in this tile.
}
else
#endif
{
// Compute the full set of interactions in this tile.
#ifdef USE_EXCLUSIONS
#ifdef USE_EXCLUSIONS
unsigned
int
excl
=
(
hasExclusions
?
exclusions
[
exclusionIndex
[
localGroupIndex
]
+
tgx
]
:
0xFFFFFFFF
);
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
TILE_SIZE
-
tgx
));
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
TILE_SIZE
-
tgx
));
#endif
#endif
unsigned
int
tj
=
tgx
;
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
#ifdef USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
);
#endif
int
atom2
=
tbx
+
tj
;
int
atom2
=
tbx
+
tj
;
real4
posq2
=
localData
[
atom2
].
posq
;
real4
posq2
=
localData
[
atom2
].
posq
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
...
@@ -180,6 +121,9 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
...
@@ -180,6 +121,9 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
atom2
=
y
*
TILE_SIZE
+
tj
;
atom2
=
y
*
TILE_SIZE
+
tj
;
real
dEdR
=
0
;
real
dEdR
=
0
;
real
tempEnergy
=
0
;
real
tempEnergy
=
0
;
#ifdef USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
);
#endif
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
COMPUTE_INTERACTION
COMPUTE_INTERACTION
dEdR
/=
-
r
;
dEdR
/=
-
r
;
...
@@ -203,27 +147,216 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
...
@@ -203,27 +147,216 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
}
}
}
}
lasty
=
y
;
// Write results.
// Write results.
if
(
pos
<
end
)
{
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
const
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
z
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
z
*
0x100000000
)));
STORE_DERIVATIVES_1
STORE_DERIVATIVES_1
}
if
(
x
!=
y
)
{
if
(
pos
<
end
&&
x
!=
y
)
{
offset
=
y
*
TILE_SIZE
+
tgx
;
const
unsigned
int
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
STORE_DERIVATIVES_2
STORE_DERIVATIVES_2
}
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
unsigned
int
numTiles
=
interactionCount
[
0
];
int
pos
=
warp
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
int
end
=
(
warp
+
1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
#else
int
pos
=
warp
*
numTiles
/
totalWarps
;
int
end
=
(
warp
+
1
)
*
numTiles
/
totalWarps
;
#endif
int
skipBase
=
0
;
int
currentSkipIndex
=
tbx
;
__shared__
int
atomIndices
[
THREAD_BLOCK_SIZE
];
__shared__
int
skipTiles
[
THREAD_BLOCK_SIZE
];
skipTiles
[
threadIdx
.
x
]
=
-
1
;
while
(
pos
<
end
)
{
const
bool
isExcluded
=
false
;
real3
force
=
make_real3
(
0
);
DECLARE_ATOM1_DERIVATIVES
bool
includeTile
=
true
;
// Extract the coordinates of this tile.
unsigned
int
x
,
y
;
bool
singlePeriodicCopy
=
false
;
#ifdef USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles
[
pos
];
x
=
tileIndices
.
x
;
singlePeriodicCopy
=
tileIndices
.
y
;
}
else
#endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}
// Skip over tiles that have exclusions, since they were already processed.
while
(
skipTiles
[
tbx
+
TILE_SIZE
-
1
]
<
pos
)
{
if
(
skipBase
+
tgx
<
NUM_TILES_WITH_EXCLUSIONS
)
{
ushort2
tile
=
exclusionTiles
[
skipBase
+
tgx
];
skipTiles
[
threadIdx
.
x
]
=
tile
.
x
+
tile
.
y
*
NUM_BLOCKS
-
tile
.
y
*
(
tile
.
y
+
1
)
/
2
;
}
else
skipTiles
[
threadIdx
.
x
]
=
end
;
skipBase
+=
TILE_SIZE
;
currentSkipIndex
=
tbx
;
}
while
(
skipTiles
[
currentSkipIndex
]
<
pos
)
currentSkipIndex
++
;
includeTile
=
(
skipTiles
[
currentSkipIndex
]
!=
pos
);
}
if
(
includeTile
)
{
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
// Load atom data for this tile.
real4
posq1
=
posq
[
atom1
];
LOAD_ATOM1_PARAMETERS
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
#ifdef USE_CUTOFF
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
]
:
y
*
TILE_SIZE
+
tgx
);
#else
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
atomIndices
[
threadIdx
.
x
]
=
j
;
if
(
j
<
PADDED_NUM_ATOMS
)
{
localData
[
localAtomIndex
].
posq
=
posq
[
j
];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData
[
localAtomIndex
].
force
=
make_real3
(
0
);
CLEAR_LOCAL_DERIVATIVES
}
#ifdef USE_PERIODIC
if
(
singlePeriodicCopy
)
{
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
real4
blockCenterX
=
blockCenter
[
x
];
posq1
.
x
-=
floor
((
posq1
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
posq1
.
y
-=
floor
((
posq1
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
posq1
.
z
-=
floor
((
posq1
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
localData
[
threadIdx
.
x
].
posq
.
x
-=
floor
((
localData
[
threadIdx
.
x
].
posq
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
localData
[
threadIdx
.
x
].
posq
.
y
-=
floor
((
localData
[
threadIdx
.
x
].
posq
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
localData
[
threadIdx
.
x
].
posq
.
z
-=
floor
((
localData
[
threadIdx
.
x
].
posq
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
real4
posq2
=
localData
[
atom2
].
posq
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
LOAD_ATOM2_PARAMETERS
atom2
=
atomIndices
[
tbx
+
tj
];
real
dEdR
=
0
;
real
tempEnergy
=
0
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
COMPUTE_INTERACTION
dEdR
/=
-
r
;
}
energy
+=
tempEnergy
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
atom2
=
tbx
+
tj
;
localData
[
atom2
].
force
.
x
+=
delta
.
x
;
localData
[
atom2
].
force
.
y
+=
delta
.
y
;
localData
[
atom2
].
force
.
z
+=
delta
.
z
;
RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
}
#endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
else
#endif
{
// We need to apply periodic boundary conditions separately for each interaction.
unsigned
int
tj
=
tgx
;
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
real4
posq2
=
localData
[
atom2
].
posq
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
LOAD_ATOM2_PARAMETERS
atom2
=
atomIndices
[
tbx
+
tj
];
real
dEdR
=
0
;
real
tempEnergy
=
0
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
COMPUTE_INTERACTION
dEdR
/=
-
r
;
}
energy
+=
tempEnergy
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
atom2
=
tbx
+
tj
;
localData
[
atom2
].
force
.
x
+=
delta
.
x
;
localData
[
atom2
].
force
.
y
+=
delta
.
y
;
localData
[
atom2
].
force
.
z
+=
delta
.
z
;
RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
}
#endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
// Write results.
atomicAdd
(
&
forceBuffers
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom1
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
z
*
0x100000000
)));
unsigned
int
offset
=
atom1
;
STORE_DERIVATIVES_1
#ifdef USE_CUTOFF
unsigned
int
atom2
=
atomIndices
[
threadIdx
.
x
];
#else
unsigned
int
atom2
=
y
*
TILE_SIZE
+
tgx
;
#endif
if
(
atom2
<
PADDED_NUM_ATOMS
)
{
atomicAdd
(
&
forceBuffers
[
atom2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
x
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
y
*
0x100000000
)));
atomicAdd
(
&
forceBuffers
[
atom2
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
force
.
z
*
0x100000000
)));
offset
=
atom2
;
STORE_DERIVATIVES_2
}
}
pos
++
;
pos
++
;
}
while
(
pos
<
end
);
}
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
}
}
platforms/cuda/src/kernels/customGBValueN2.cu
View file @
93c467b2
#define TILE_SIZE 32
typedef
struct
{
typedef
struct
{
real4
posq
;
real4
posq
;
real
value
,
temp
;
real
value
,
temp
;
...
@@ -13,86 +11,41 @@ typedef struct {
...
@@ -13,86 +11,41 @@ typedef struct {
* Compute a value based on pair interactions.
* Compute a value based on pair interactions.
*/
*/
extern
"C"
__global__
void
computeN2Value
(
const
real4
*
__restrict__
posq
,
const
unsigned
int
*
__restrict__
exclusions
,
extern
"C"
__global__
void
computeN2Value
(
const
real4
*
__restrict__
posq
,
const
unsigned
int
*
__restrict__
exclusions
,
const
u
nsigned
int
*
__restrict__
exclusion
Indices
,
const
unsigned
int
*
__restrict__
exclusionRowIndic
es
,
unsigned
long
long
*
__restrict__
global_value
,
const
u
short2
*
__restrict__
exclusion
Til
es
,
unsigned
long
long
*
__restrict__
global_value
,
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
unsigned
int
*
__restrict__
interacti
onFlag
s
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
real4
*
__restrict__
blockCenter
,
const
unsigned
int
*
__restrict__
interacti
ngAtom
s
#else
#else
unsigned
int
numTiles
unsigned
int
numTiles
#endif
#endif
PARAMETER_ARGUMENTS
)
{
PARAMETER_ARGUMENTS
)
{
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
const
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
const
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
#ifdef USE_CUTOFF
unsigned
int
numTiles
=
interactionCount
[
0
];
unsigned
int
pos
=
warp
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
unsigned
int
end
=
(
warp
+
1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
#else
unsigned
int
pos
=
warp
*
numTiles
/
totalWarps
;
unsigned
int
end
=
(
warp
+
1
)
*
numTiles
/
totalWarps
;
#endif
real
energy
=
0
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
__shared__
unsigned
int
exclusionRange
[
2
*
WARPS_PER_GROUP
];
__shared__
int
exclusionIndex
[
WARPS_PER_GROUP
];
do
{
// Extract the coordinates of this tile
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
const
unsigned
int
localGroupIndex
=
threadIdx
.
x
/
TILE_SIZE
;
__shared__
AtomData
localData
[
THREAD_BLOCK_SIZE
];
unsigned
int
x
,
y
;
// First loop: process tiles that contain exclusions.
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE
+
warp
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE
+
(
warp
+
1
)
*
(
LAST_EXCLUSION_TILE
-
FIRST_EXCLUSION_TILE
)
/
totalWarps
;
for
(
int
pos
=
firstExclusionTile
;
pos
<
lastExclusionTile
;
pos
++
)
{
const
ushort2
tileIndices
=
exclusionTiles
[
pos
];
const
unsigned
int
x
=
tileIndices
.
x
;
const
unsigned
int
y
=
tileIndices
.
y
;
real
value
=
0
;
real
value
=
0
;
if
(
pos
<
end
)
{
#ifdef USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles
[
pos
];
x
=
tileIndices
.
x
;
y
=
tileIndices
.
y
;
}
else
#endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}
}
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real4
posq1
=
posq
[
atom1
];
real4
posq1
=
posq
[
atom1
];
LOAD_ATOM1_PARAMETERS
LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS
#ifdef USE_EXCLUSIONS
if
(
tgx
<
2
)
unsigned
int
excl
=
exclusions
[
pos
*
TILE_SIZE
+
tgx
];
exclusionRange
[
2
*
localGroupIndex
+
tgx
]
=
exclusionRowIndices
[
x
+
tgx
];
if
(
tgx
==
0
)
exclusionIndex
[
localGroupIndex
]
=
-
1
;
for
(
unsigned
int
i
=
exclusionRange
[
2
*
localGroupIndex
]
+
tgx
;
i
<
exclusionRange
[
2
*
localGroupIndex
+
1
];
i
+=
TILE_SIZE
)
if
(
exclusionIndices
[
i
]
==
y
)
exclusionIndex
[
localGroupIndex
]
=
i
*
TILE_SIZE
;
bool
hasExclusions
=
(
exclusionIndex
[
localGroupIndex
]
>
-
1
);
#else
bool
hasExclusions
=
false
;
#endif
#endif
if
(
pos
>=
end
)
if
(
x
==
y
)
{
;
// This warp is done.
else
if
(
x
==
y
)
{
// This tile is on the diagonal.
// This tile is on the diagonal.
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
localData
[
localAtomIndex
].
posq
=
posq1
;
localData
[
localAtomIndex
].
posq
=
posq1
;
LOAD_LOCAL_PARAMETERS_FROM_1
LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS
unsigned
int
excl
=
exclusions
[
exclusionIndex
[
localGroupIndex
]
+
tgx
];
#endif
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
#ifdef USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
);
#endif
int
atom2
=
tbx
+
j
;
int
atom2
=
tbx
+
j
;
real4
posq2
=
localData
[
atom2
].
posq
;
real4
posq2
=
localData
[
atom2
].
posq
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
...
@@ -112,7 +65,8 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
...
@@ -112,7 +65,8 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
real
tempValue1
=
0
;
real
tempValue1
=
0
;
real
tempValue2
=
0
;
real
tempValue2
=
0
;
#ifdef USE_EXCLUSIONS
#ifdef USE_EXCLUSIONS
if
(
!
isExcluded
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
atom1
!=
atom2
)
{
bool
isExcluded
=
(
atom1
>=
NUM_ATOMS
||
atom2
>=
NUM_ATOMS
||
!
(
excl
&
0x1
));
if
(
!
isExcluded
&&
atom1
!=
atom2
)
{
#else
#else
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
atom1
!=
atom2
)
{
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
atom1
!=
atom2
)
{
#endif
#endif
...
@@ -130,25 +84,17 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
...
@@ -130,25 +84,17 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
else
{
else
{
// This is an off-diagonal tile.
// This is an off-diagonal tile.
if
(
lasty
!=
y
)
{
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
localData
[
threadIdx
.
x
].
posq
=
posq
[
j
];
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
localData
[
localAtomIndex
].
posq
=
posq
[
j
];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
localData
[
localAtomIndex
].
value
=
0
;
localData
[
threadIdx
.
x
].
value
=
0
;
#ifdef USE_EXCLUSIONS
#ifdef USE_CUTOFF
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
TILE_SIZE
-
tgx
));
unsigned
int
flags
=
(
numTiles
<=
maxTiles
?
interactionFlags
[
pos
]
:
0xFFFFFFFF
);
#endif
if
(
!
hasExclusions
&&
flags
!=
0xFFFFFFFF
)
{
unsigned
int
tj
=
tgx
;
if
(
flags
==
0
)
{
for
(
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
// No interactions in this tile.
int
atom2
=
tbx
+
tj
;
}
else
{
// Compute only a subset of the interactions in this tile.
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
if
((
flags
&
(
1
<<
j
))
!=
0
)
{
int
atom2
=
tbx
+
j
;
real4
posq2
=
localData
[
atom2
].
posq
;
real4
posq2
=
localData
[
atom2
].
posq
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
...
@@ -157,44 +103,162 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
...
@@ -157,44 +103,162 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
real
tempValue1
=
0
;
#ifdef USE_CUTOFF
real
tempValue2
=
0
;
if
(
r2
<
CUTOFF_SQUARED
)
{
if
(
r2
<
CUTOFF_SQUARED
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
real
r
=
RECIP
(
invR
);
LOAD_ATOM2_PARAMETERS
LOAD_ATOM2_PARAMETERS
atom2
=
y
*
TILE_SIZE
+
j
;
atom2
=
y
*
TILE_SIZE
+
tj
;
real
tempValue1
=
0
;
real
tempValue2
=
0
;
#ifdef USE_EXCLUSIONS
bool
isExcluded
=
(
atom1
>=
NUM_ATOMS
||
atom2
>=
NUM_ATOMS
||
!
(
excl
&
0x1
));
if
(
!
isExcluded
)
{
#else
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
#endif
COMPUTE_VALUE
COMPUTE_VALUE
}
}
value
+=
tempValue1
;
value
+=
tempValue1
;
localData
[
tbx
+
tj
].
value
+=
tempValue2
;
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl
>>=
1
;
#endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
}
localData
[
threadIdx
.
x
].
temp
=
tempValue2
;
// Sum the forces on atom2
.
// Write results
.
if
(
tgx
%
4
==
0
)
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
localData
[
threadIdx
.
x
].
temp
+=
lo
c
al
Data
[
threadIdx
.
x
+
1
].
temp
+
localData
[
threadIdx
.
x
+
2
].
temp
+
localData
[
threadIdx
.
x
+
3
].
temp
;
atomicAdd
(
&
g
lo
b
al
_value
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
value
*
0x100000000
)))
;
if
(
tg
x
=
=
0
)
if
(
x
!
=
y
)
{
localData
[
tbx
+
j
].
value
+=
localData
[
threadIdx
.
x
].
temp
+
localData
[
threadIdx
.
x
+
4
].
temp
+
localData
[
threadIdx
.
x
+
8
].
temp
+
localData
[
threadIdx
.
x
+
12
].
temp
+
localData
[
threadIdx
.
x
+
16
].
temp
+
localData
[
threadIdx
.
x
+
20
].
temp
+
localData
[
threadIdx
.
x
+
24
].
temp
+
localData
[
threadIdx
.
x
+
28
].
temp
;
offset
=
y
*
TILE_SIZE
+
tgx
;
}
atomicAdd
(
&
global_value
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
value
*
0x100000000
)));
}
}
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
unsigned
int
numTiles
=
interactionCount
[
0
];
int
pos
=
warp
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
int
end
=
(
warp
+
1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
#else
int
pos
=
warp
*
numTiles
/
totalWarps
;
int
end
=
(
warp
+
1
)
*
numTiles
/
totalWarps
;
#endif
int
skipBase
=
0
;
int
currentSkipIndex
=
tbx
;
__shared__
int
atomIndices
[
THREAD_BLOCK_SIZE
];
__shared__
int
skipTiles
[
THREAD_BLOCK_SIZE
];
skipTiles
[
threadIdx
.
x
]
=
-
1
;
while
(
pos
<
end
)
{
real
value
=
0
;
bool
includeTile
=
true
;
// Extract the coordinates of this tile.
unsigned
int
x
,
y
;
bool
singlePeriodicCopy
=
false
;
#ifdef USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles
[
pos
];
x
=
tileIndices
.
x
;
singlePeriodicCopy
=
tileIndices
.
y
;
}
}
else
else
#endif
#endif
{
{
// Compute the full set of interactions in this tile.
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
SQRT
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}
#ifdef USE_EXCLUSIONS
// Skip over tiles that have exclusions, since they were already processed.
unsigned
int
excl
=
(
hasExclusions
?
exclusions
[
exclusionIndex
[
localGroupIndex
]
+
tgx
]
:
0xFFFFFFFF
);
excl
=
(
excl
>>
tgx
)
|
(
excl
<<
(
TILE_SIZE
-
tgx
));
while
(
skipTiles
[
tbx
+
TILE_SIZE
-
1
]
<
pos
)
{
if
(
skipBase
+
tgx
<
NUM_TILES_WITH_EXCLUSIONS
)
{
ushort2
tile
=
exclusionTiles
[
skipBase
+
tgx
];
skipTiles
[
threadIdx
.
x
]
=
tile
.
x
+
tile
.
y
*
NUM_BLOCKS
-
tile
.
y
*
(
tile
.
y
+
1
)
/
2
;
}
else
skipTiles
[
threadIdx
.
x
]
=
end
;
skipBase
+=
TILE_SIZE
;
currentSkipIndex
=
tbx
;
}
while
(
skipTiles
[
currentSkipIndex
]
<
pos
)
currentSkipIndex
++
;
includeTile
=
(
skipTiles
[
currentSkipIndex
]
!=
pos
);
}
if
(
includeTile
)
{
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
// Load atom data for this tile.
real4
posq1
=
posq
[
atom1
];
LOAD_ATOM1_PARAMETERS
const
unsigned
int
localAtomIndex
=
threadIdx
.
x
;
#ifdef USE_CUTOFF
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms
[
pos
*
TILE_SIZE
+
tgx
]
:
y
*
TILE_SIZE
+
tgx
);
#else
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
#endif
#endif
atomIndices
[
threadIdx
.
x
]
=
j
;
if
(
j
<
PADDED_NUM_ATOMS
)
{
localData
[
localAtomIndex
].
posq
=
posq
[
j
];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData
[
localAtomIndex
].
value
=
0
;
}
#ifdef USE_PERIODIC
if
(
singlePeriodicCopy
)
{
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
real4
blockCenterX
=
blockCenter
[
x
];
posq1
.
x
-=
floor
((
posq1
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
posq1
.
y
-=
floor
((
posq1
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
posq1
.
z
-=
floor
((
posq1
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
localData
[
threadIdx
.
x
].
posq
.
x
-=
floor
((
localData
[
threadIdx
.
x
].
posq
.
x
-
blockCenterX
.
x
)
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
localData
[
threadIdx
.
x
].
posq
.
y
-=
floor
((
localData
[
threadIdx
.
x
].
posq
.
y
-
blockCenterX
.
y
)
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
localData
[
threadIdx
.
x
].
posq
.
z
-=
floor
((
localData
[
threadIdx
.
x
].
posq
.
z
-
blockCenterX
.
z
)
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
unsigned
int
tj
=
tgx
;
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
#ifdef USE_EXCLUSIONS
int
atom2
=
tbx
+
tj
;
bool
isExcluded
=
!
(
excl
&
0x1
);
real4
posq2
=
localData
[
atom2
].
posq
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
if
(
r2
<
CUTOFF_SQUARED
)
{
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
LOAD_ATOM2_PARAMETERS
atom2
=
atomIndices
[
tbx
+
tj
];
real
tempValue1
=
0
;
real
tempValue2
=
0
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
COMPUTE_VALUE
}
value
+=
tempValue1
;
localData
[
tbx
+
tj
].
value
+=
tempValue2
;
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
else
#endif
#endif
{
// We need to apply periodic boundary conditions separately for each interaction.
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
int
atom2
=
tbx
+
tj
;
int
atom2
=
tbx
+
tj
;
real4
posq2
=
localData
[
atom2
].
posq
;
real4
posq2
=
localData
[
atom2
].
posq
;
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
...
@@ -210,41 +274,32 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
...
@@ -210,41 +274,32 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
real
invR
=
RSQRT
(
r2
);
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
real
r
=
RECIP
(
invR
);
LOAD_ATOM2_PARAMETERS
LOAD_ATOM2_PARAMETERS
atom2
=
y
*
TILE_SIZE
+
tj
;
atom2
=
atomIndices
[
tbx
+
tj
]
;
real
tempValue1
=
0
;
real
tempValue1
=
0
;
real
tempValue2
=
0
;
real
tempValue2
=
0
;
#ifdef USE_EXCLUSIONS
if
(
!
isExcluded
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
#else
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
#endif
COMPUTE_VALUE
COMPUTE_VALUE
}
}
value
+=
tempValue1
;
value
+=
tempValue1
;
localData
[
tbx
+
tj
].
value
+=
tempValue2
;
localData
[
tbx
+
tj
].
value
+=
tempValue2
;
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
}
}
#endif
#ifdef USE_EXCLUSIONS
excl
>>=
1
;
#endif
#endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
}
}
}
}
// Write results.
// Write results.
if
(
pos
<
end
)
{
atomicAdd
(
&
global_value
[
atom1
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
value
*
0x100000000
)));
const
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
#ifdef USE_CUTOFF
atomicAdd
(
&
global_value
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
value
*
0x100000000
)));
unsigned
int
atom2
=
atomIndices
[
threadIdx
.
x
];
}
#else
if
(
pos
<
end
&&
x
!=
y
)
{
unsigned
int
atom2
=
y
*
TILE_SIZE
+
tgx
;
const
unsigned
int
offset
=
y
*
TILE_SIZE
+
tgx
;
#endif
atomicAdd
(
&
global_value
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
value
*
0x100000000
)));
if
(
atom2
<
PADDED_NUM_ATOMS
)
atomicAdd
(
&
global_value
[
atom2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
value
*
0x100000000
)));
}
}
lasty
=
y
;
pos
++
;
pos
++
;
}
while
(
pos
<
end
);
}
}
}
platforms/cuda/src/kernels/customHbondForce.cu
View file @
93c467b2
...
@@ -48,12 +48,12 @@ inline __device__ real computeAngle(real4 vec1, real4 vec2) {
...
@@ -48,12 +48,12 @@ inline __device__ real computeAngle(real4 vec1, real4 vec2) {
real3
crossProduct
=
cross
(
vec1
,
vec2
);
real3
crossProduct
=
cross
(
vec1
,
vec2
);
real
scale
=
vec1
.
w
*
vec2
.
w
;
real
scale
=
vec1
.
w
*
vec2
.
w
;
angle
=
asin
(
SQRT
(
dot
(
crossProduct
,
crossProduct
)
/
scale
));
angle
=
ASIN
(
SQRT
(
dot
(
crossProduct
,
crossProduct
)
/
scale
));
if
(
cosine
<
0.0
f
)
if
(
cosine
<
0.0
f
)
angle
=
M_PI
-
angle
;
angle
=
M_PI
-
angle
;
}
}
else
else
angle
=
acos
(
cosine
);
angle
=
ACOS
(
cosine
);
return
angle
;
return
angle
;
}
}
...
...
platforms/cuda/src/kernels/ewald.cu
View file @
93c467b2
...
@@ -35,11 +35,11 @@ extern "C" __global__ void calculateEwaldCosSinSums(real* __restrict__ energyBuf
...
@@ -35,11 +35,11 @@ extern "C" __global__ void calculateEwaldCosSinSums(real* __restrict__ energyBuf
for
(
int
atom
=
0
;
atom
<
NUM_ATOMS
;
atom
++
)
{
for
(
int
atom
=
0
;
atom
<
NUM_ATOMS
;
atom
++
)
{
real4
apos
=
posq
[
atom
];
real4
apos
=
posq
[
atom
];
real
phase
=
apos
.
x
*
kx
;
real
phase
=
apos
.
x
*
kx
;
real2
structureFactor
=
make_real2
(
cos
(
phase
),
sin
(
phase
));
real2
structureFactor
=
make_real2
(
COS
(
phase
),
SIN
(
phase
));
phase
=
apos
.
y
*
ky
;
phase
=
apos
.
y
*
ky
;
structureFactor
=
multofReal2
(
structureFactor
,
make_real2
(
cos
(
phase
),
sin
(
phase
)));
structureFactor
=
multofReal2
(
structureFactor
,
make_real2
(
COS
(
phase
),
SIN
(
phase
)));
phase
=
apos
.
z
*
kz
;
phase
=
apos
.
z
*
kz
;
structureFactor
=
multofReal2
(
structureFactor
,
make_real2
(
cos
(
phase
),
sin
(
phase
)));
structureFactor
=
multofReal2
(
structureFactor
,
make_real2
(
COS
(
phase
),
SIN
(
phase
)));
sum
+=
apos
.
w
*
structureFactor
;
sum
+=
apos
.
w
*
structureFactor
;
}
}
cosSinSum
[
index
]
=
sum
;
cosSinSum
[
index
]
=
sum
;
...
@@ -76,9 +76,9 @@ extern "C" __global__ void calculateEwaldForces(unsigned long long* __restrict__
...
@@ -76,9 +76,9 @@ extern "C" __global__ void calculateEwaldForces(unsigned long long* __restrict__
for
(
int
ry
=
lowry
;
ry
<
KMAX_Y
;
ry
++
)
{
for
(
int
ry
=
lowry
;
ry
<
KMAX_Y
;
ry
++
)
{
real
ky
=
ry
*
reciprocalBoxSize
.
y
;
real
ky
=
ry
*
reciprocalBoxSize
.
y
;
real
phase
=
apos
.
x
*
kx
;
real
phase
=
apos
.
x
*
kx
;
real2
tab_xy
=
make_real2
(
cos
(
phase
),
sin
(
phase
));
real2
tab_xy
=
make_real2
(
COS
(
phase
),
SIN
(
phase
));
phase
=
apos
.
y
*
ky
;
phase
=
apos
.
y
*
ky
;
tab_xy
=
multofReal2
(
tab_xy
,
make_real2
(
cos
(
phase
),
sin
(
phase
)));
tab_xy
=
multofReal2
(
tab_xy
,
make_real2
(
COS
(
phase
),
SIN
(
phase
)));
for
(
int
rz
=
lowrz
;
rz
<
KMAX_Z
;
rz
++
)
{
for
(
int
rz
=
lowrz
;
rz
<
KMAX_Z
;
rz
++
)
{
real
kz
=
rz
*
reciprocalBoxSize
.
z
;
real
kz
=
rz
*
reciprocalBoxSize
.
z
;
...
@@ -88,7 +88,7 @@ extern "C" __global__ void calculateEwaldForces(unsigned long long* __restrict__
...
@@ -88,7 +88,7 @@ extern "C" __global__ void calculateEwaldForces(unsigned long long* __restrict__
real
k2
=
kx
*
kx
+
ky
*
ky
+
kz
*
kz
;
real
k2
=
kx
*
kx
+
ky
*
ky
+
kz
*
kz
;
real
ak
=
EXP
(
k2
*
EXP_COEFFICIENT
)
/
k2
;
real
ak
=
EXP
(
k2
*
EXP_COEFFICIENT
)
/
k2
;
phase
=
apos
.
z
*
kz
;
phase
=
apos
.
z
*
kz
;
real2
structureFactor
=
multofReal2
(
tab_xy
,
make_real2
(
cos
(
phase
),
sin
(
phase
)));
real2
structureFactor
=
multofReal2
(
tab_xy
,
make_real2
(
COS
(
phase
),
SIN
(
phase
)));
real2
sum
=
cosSinSum
[
index
];
real2
sum
=
cosSinSum
[
index
];
real
dEdR
=
2
*
reciprocalCoefficient
*
ak
*
apos
.
w
*
(
sum
.
x
*
structureFactor
.
y
-
sum
.
y
*
structureFactor
.
x
);
real
dEdR
=
2
*
reciprocalCoefficient
*
ak
*
apos
.
w
*
(
sum
.
x
*
structureFactor
.
y
-
sum
.
y
*
structureFactor
.
x
);
force
.
x
+=
dEdR
*
kx
;
force
.
x
+=
dEdR
*
kx
;
...
...
platforms/cuda/src/kernels/findInteractingBlocks.cu
View file @
93c467b2
This diff is collapsed.
Click to expand it.
platforms/cuda/src/kernels/gbsaObc1.cu
View file @
93c467b2
This diff is collapsed.
Click to expand it.
platforms/cuda/src/kernels/integrationUtilities.cu
View file @
93c467b2
...
@@ -24,14 +24,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
...
@@ -24,14 +24,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
state
.
y
^=
state
.
y
<<
13
;
state
.
y
^=
state
.
y
<<
13
;
state
.
y
^=
state
.
y
>>
17
;
state
.
y
^=
state
.
y
>>
17
;
state
.
y
^=
state
.
y
<<
5
;
state
.
y
^=
state
.
y
<<
5
;
x1
=
sqrt
(
-
2.0
f
*
log
(
x1
));
x1
=
SQRT
(
-
2.0
f
*
LOG
(
x1
));
k
=
(
state
.
z
>>
2
)
+
(
state
.
w
>>
3
)
+
(
carry
>>
2
);
k
=
(
state
.
z
>>
2
)
+
(
state
.
w
>>
3
)
+
(
carry
>>
2
);
m
=
state
.
w
+
state
.
w
+
state
.
z
+
carry
;
m
=
state
.
w
+
state
.
w
+
state
.
z
+
carry
;
state
.
z
=
state
.
w
;
state
.
z
=
state
.
w
;
state
.
w
=
m
;
state
.
w
=
m
;
carry
=
k
>>
30
;
carry
=
k
>>
30
;
float
x2
=
(
float
)(
state
.
x
+
state
.
y
+
state
.
w
)
/
(
float
)
0xffffffff
;
float
x2
=
(
float
)(
state
.
x
+
state
.
y
+
state
.
w
)
/
(
float
)
0xffffffff
;
value
.
x
=
x1
*
cos
(
2.0
f
*
3.14159265
f
*
x2
);
value
.
x
=
x1
*
COS
(
2.0
f
*
3.14159265
f
*
x2
);
// Generate second value.
// Generate second value.
...
@@ -49,14 +49,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
...
@@ -49,14 +49,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
state
.
y
^=
state
.
y
<<
13
;
state
.
y
^=
state
.
y
<<
13
;
state
.
y
^=
state
.
y
>>
17
;
state
.
y
^=
state
.
y
>>
17
;
state
.
y
^=
state
.
y
<<
5
;
state
.
y
^=
state
.
y
<<
5
;
x3
=
sqrt
(
-
2.0
f
*
log
(
x3
));
x3
=
SQRT
(
-
2.0
f
*
LOG
(
x3
));
k
=
(
state
.
z
>>
2
)
+
(
state
.
w
>>
3
)
+
(
carry
>>
2
);
k
=
(
state
.
z
>>
2
)
+
(
state
.
w
>>
3
)
+
(
carry
>>
2
);
m
=
state
.
w
+
state
.
w
+
state
.
z
+
carry
;
m
=
state
.
w
+
state
.
w
+
state
.
z
+
carry
;
state
.
z
=
state
.
w
;
state
.
z
=
state
.
w
;
state
.
w
=
m
;
state
.
w
=
m
;
carry
=
k
>>
30
;
carry
=
k
>>
30
;
float
x4
=
(
float
)(
state
.
x
+
state
.
y
+
state
.
w
)
/
(
float
)
0xffffffff
;
float
x4
=
(
float
)(
state
.
x
+
state
.
y
+
state
.
w
)
/
(
float
)
0xffffffff
;
value
.
y
=
x3
*
cos
(
2.0
f
*
3.14159265
f
*
x4
);
value
.
y
=
x3
*
COS
(
2.0
f
*
3.14159265
f
*
x4
);
// Generate third value.
// Generate third value.
...
@@ -74,14 +74,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
...
@@ -74,14 +74,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
state
.
y
^=
state
.
y
<<
13
;
state
.
y
^=
state
.
y
<<
13
;
state
.
y
^=
state
.
y
>>
17
;
state
.
y
^=
state
.
y
>>
17
;
state
.
y
^=
state
.
y
<<
5
;
state
.
y
^=
state
.
y
<<
5
;
x5
=
sqrt
(
-
2.0
f
*
log
(
x5
));
x5
=
SQRT
(
-
2.0
f
*
LOG
(
x5
));
k
=
(
state
.
z
>>
2
)
+
(
state
.
w
>>
3
)
+
(
carry
>>
2
);
k
=
(
state
.
z
>>
2
)
+
(
state
.
w
>>
3
)
+
(
carry
>>
2
);
m
=
state
.
w
+
state
.
w
+
state
.
z
+
carry
;
m
=
state
.
w
+
state
.
w
+
state
.
z
+
carry
;
state
.
z
=
state
.
w
;
state
.
z
=
state
.
w
;
state
.
w
=
m
;
state
.
w
=
m
;
carry
=
k
>>
30
;
carry
=
k
>>
30
;
float
x6
=
(
float
)(
state
.
x
+
state
.
y
+
state
.
w
)
/
(
float
)
0xffffffff
;
float
x6
=
(
float
)(
state
.
x
+
state
.
y
+
state
.
w
)
/
(
float
)
0xffffffff
;
value
.
z
=
x5
*
cos
(
2.0
f
*
3.14159265
f
*
x6
);
value
.
z
=
x5
*
COS
(
2.0
f
*
3.14159265
f
*
x6
);
// Generate fourth value.
// Generate fourth value.
...
@@ -99,14 +99,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
...
@@ -99,14 +99,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
state
.
y
^=
state
.
y
<<
13
;
state
.
y
^=
state
.
y
<<
13
;
state
.
y
^=
state
.
y
>>
17
;
state
.
y
^=
state
.
y
>>
17
;
state
.
y
^=
state
.
y
<<
5
;
state
.
y
^=
state
.
y
<<
5
;
x7
=
sqrt
(
-
2.0
f
*
log
(
x7
));
x7
=
SQRT
(
-
2.0
f
*
LOG
(
x7
));
k
=
(
state
.
z
>>
2
)
+
(
state
.
w
>>
3
)
+
(
carry
>>
2
);
k
=
(
state
.
z
>>
2
)
+
(
state
.
w
>>
3
)
+
(
carry
>>
2
);
m
=
state
.
w
+
state
.
w
+
state
.
z
+
carry
;
m
=
state
.
w
+
state
.
w
+
state
.
z
+
carry
;
state
.
z
=
state
.
w
;
state
.
z
=
state
.
w
;
state
.
w
=
m
;
state
.
w
=
m
;
carry
=
k
>>
30
;
carry
=
k
>>
30
;
float
x8
=
(
float
)(
state
.
x
+
state
.
y
+
state
.
w
)
/
(
float
)
0xffffffff
;
float
x8
=
(
float
)(
state
.
x
+
state
.
y
+
state
.
w
)
/
(
float
)
0xffffffff
;
value
.
w
=
x7
*
cos
(
2.0
f
*
3.14159265
f
*
x8
);
value
.
w
=
x7
*
COS
(
2.0
f
*
3.14159265
f
*
x8
);
// Record the values.
// Record the values.
...
@@ -412,9 +412,9 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
...
@@ -412,9 +412,9 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
mixed
yaksYd
=
zaksZd
*
xaksXd
-
xaksZd
*
zaksXd
;
mixed
yaksYd
=
zaksZd
*
xaksXd
-
xaksZd
*
zaksXd
;
mixed
zaksYd
=
xaksZd
*
yaksXd
-
yaksZd
*
xaksXd
;
mixed
zaksYd
=
xaksZd
*
yaksXd
-
yaksZd
*
xaksXd
;
mixed
axlng
=
sqrt
(
xaksXd
*
xaksXd
+
yaksXd
*
yaksXd
+
zaksXd
*
zaksXd
);
mixed
axlng
=
SQRT
(
xaksXd
*
xaksXd
+
yaksXd
*
yaksXd
+
zaksXd
*
zaksXd
);
mixed
aylng
=
sqrt
(
xaksYd
*
xaksYd
+
yaksYd
*
yaksYd
+
zaksYd
*
zaksYd
);
mixed
aylng
=
SQRT
(
xaksYd
*
xaksYd
+
yaksYd
*
yaksYd
+
zaksYd
*
zaksYd
);
mixed
azlng
=
sqrt
(
xaksZd
*
xaksZd
+
yaksZd
*
yaksZd
+
zaksZd
*
zaksZd
);
mixed
azlng
=
SQRT
(
xaksZd
*
xaksZd
+
yaksZd
*
yaksZd
+
zaksZd
*
zaksZd
);
mixed
trns11
=
xaksXd
/
axlng
;
mixed
trns11
=
xaksXd
/
axlng
;
mixed
trns21
=
yaksXd
/
axlng
;
mixed
trns21
=
yaksXd
/
axlng
;
mixed
trns31
=
zaksXd
/
axlng
;
mixed
trns31
=
zaksXd
/
axlng
;
...
@@ -440,13 +440,13 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
...
@@ -440,13 +440,13 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
// --- Step2 A2' ---
// --- Step2 A2' ---
float
rc
=
0.5
f
*
params
.
y
;
float
rc
=
0.5
f
*
params
.
y
;
mixed
rb
=
sqrt
(
params
.
x
*
params
.
x
-
rc
*
rc
);
mixed
rb
=
SQRT
(
params
.
x
*
params
.
x
-
rc
*
rc
);
mixed
ra
=
rb
*
(
m1
+
m2
)
*
invTotalMass
;
mixed
ra
=
rb
*
(
m1
+
m2
)
*
invTotalMass
;
rb
-=
ra
;
rb
-=
ra
;
mixed
sinphi
=
za1d
/
ra
;
mixed
sinphi
=
za1d
/
ra
;
mixed
cosphi
=
sqrt
(
1
-
sinphi
*
sinphi
);
mixed
cosphi
=
SQRT
(
1
-
sinphi
*
sinphi
);
mixed
sinpsi
=
(
zb1d
-
zc1d
)
/
(
2
*
rc
*
cosphi
);
mixed
sinpsi
=
(
zb1d
-
zc1d
)
/
(
2
*
rc
*
cosphi
);
mixed
cospsi
=
sqrt
(
1
-
sinpsi
*
sinpsi
);
mixed
cospsi
=
SQRT
(
1
-
sinpsi
*
sinpsi
);
mixed
ya2d
=
ra
*
cosphi
;
mixed
ya2d
=
ra
*
cosphi
;
mixed
xb2d
=
-
rc
*
cospsi
;
mixed
xb2d
=
-
rc
*
cospsi
;
...
@@ -454,7 +454,7 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
...
@@ -454,7 +454,7 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
mixed
yc2d
=
-
rb
*
cosphi
+
rc
*
sinpsi
*
sinphi
;
mixed
yc2d
=
-
rb
*
cosphi
+
rc
*
sinpsi
*
sinphi
;
mixed
xb2d2
=
xb2d
*
xb2d
;
mixed
xb2d2
=
xb2d
*
xb2d
;
mixed
hh2
=
4.0
f
*
xb2d2
+
(
yb2d
-
yc2d
)
*
(
yb2d
-
yc2d
)
+
(
zb1d
-
zc1d
)
*
(
zb1d
-
zc1d
);
mixed
hh2
=
4.0
f
*
xb2d2
+
(
yb2d
-
yc2d
)
*
(
yb2d
-
yc2d
)
+
(
zb1d
-
zc1d
)
*
(
zb1d
-
zc1d
);
mixed
deltx
=
2.0
f
*
xb2d
+
sqrt
(
4.0
f
*
xb2d2
-
hh2
+
params
.
y
*
params
.
y
);
mixed
deltx
=
2.0
f
*
xb2d
+
SQRT
(
4.0
f
*
xb2d2
-
hh2
+
params
.
y
*
params
.
y
);
xb2d
-=
deltx
*
0.5
f
;
xb2d
-=
deltx
*
0.5
f
;
// --- Step3 al,be,ga ---
// --- Step3 al,be,ga ---
...
@@ -464,11 +464,11 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
...
@@ -464,11 +464,11 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
mixed
gamma
=
xb0d
*
yb1d
-
xb1d
*
yb0d
+
xc0d
*
yc1d
-
xc1d
*
yc0d
;
mixed
gamma
=
xb0d
*
yb1d
-
xb1d
*
yb0d
+
xc0d
*
yc1d
-
xc1d
*
yc0d
;
mixed
al2be2
=
alpha
*
alpha
+
beta
*
beta
;
mixed
al2be2
=
alpha
*
alpha
+
beta
*
beta
;
mixed
sintheta
=
(
alpha
*
gamma
-
beta
*
sqrt
(
al2be2
-
gamma
*
gamma
))
/
al2be2
;
mixed
sintheta
=
(
alpha
*
gamma
-
beta
*
SQRT
(
al2be2
-
gamma
*
gamma
))
/
al2be2
;
// --- Step4 A3' ---
// --- Step4 A3' ---
mixed
costheta
=
sqrt
(
1
-
sintheta
*
sintheta
);
mixed
costheta
=
SQRT
(
1
-
sintheta
*
sintheta
);
mixed
xa3d
=
-
ya2d
*
sintheta
;
mixed
xa3d
=
-
ya2d
*
sintheta
;
mixed
ya3d
=
ya2d
*
costheta
;
mixed
ya3d
=
ya2d
*
costheta
;
mixed
za3d
=
za1d
;
mixed
za3d
=
za1d
;
...
@@ -534,9 +534,9 @@ extern "C" __global__ void applySettleToVelocities(int numClusters, mixed tol, c
...
@@ -534,9 +534,9 @@ extern "C" __global__ void applySettleToVelocities(int numClusters, mixed tol, c
mixed3
eAB
=
make_mixed3
(
apos1
.
x
-
apos0
.
x
,
apos1
.
y
-
apos0
.
y
,
apos1
.
z
-
apos0
.
z
);
mixed3
eAB
=
make_mixed3
(
apos1
.
x
-
apos0
.
x
,
apos1
.
y
-
apos0
.
y
,
apos1
.
z
-
apos0
.
z
);
mixed3
eBC
=
make_mixed3
(
apos2
.
x
-
apos1
.
x
,
apos2
.
y
-
apos1
.
y
,
apos2
.
z
-
apos1
.
z
);
mixed3
eBC
=
make_mixed3
(
apos2
.
x
-
apos1
.
x
,
apos2
.
y
-
apos1
.
y
,
apos2
.
z
-
apos1
.
z
);
mixed3
eCA
=
make_mixed3
(
apos0
.
x
-
apos2
.
x
,
apos0
.
y
-
apos2
.
y
,
apos0
.
z
-
apos2
.
z
);
mixed3
eCA
=
make_mixed3
(
apos0
.
x
-
apos2
.
x
,
apos0
.
y
-
apos2
.
y
,
apos0
.
z
-
apos2
.
z
);
eAB
*=
rsqrt
(
eAB
.
x
*
eAB
.
x
+
eAB
.
y
*
eAB
.
y
+
eAB
.
z
*
eAB
.
z
);
eAB
*=
RSQRT
(
eAB
.
x
*
eAB
.
x
+
eAB
.
y
*
eAB
.
y
+
eAB
.
z
*
eAB
.
z
);
eBC
*=
rsqrt
(
eBC
.
x
*
eBC
.
x
+
eBC
.
y
*
eBC
.
y
+
eBC
.
z
*
eBC
.
z
);
eBC
*=
RSQRT
(
eBC
.
x
*
eBC
.
x
+
eBC
.
y
*
eBC
.
y
+
eBC
.
z
*
eBC
.
z
);
eCA
*=
rsqrt
(
eCA
.
x
*
eCA
.
x
+
eCA
.
y
*
eCA
.
y
+
eCA
.
z
*
eCA
.
z
);
eCA
*=
RSQRT
(
eCA
.
x
*
eCA
.
x
+
eCA
.
y
*
eCA
.
y
+
eCA
.
z
*
eCA
.
z
);
mixed
vAB
=
(
v1
.
x
-
v0
.
x
)
*
eAB
.
x
+
(
v1
.
y
-
v0
.
y
)
*
eAB
.
y
+
(
v1
.
z
-
v0
.
z
)
*
eAB
.
z
;
mixed
vAB
=
(
v1
.
x
-
v0
.
x
)
*
eAB
.
x
+
(
v1
.
y
-
v0
.
y
)
*
eAB
.
y
+
(
v1
.
z
-
v0
.
z
)
*
eAB
.
z
;
mixed
vBC
=
(
v2
.
x
-
v1
.
x
)
*
eBC
.
x
+
(
v2
.
y
-
v1
.
y
)
*
eBC
.
y
+
(
v2
.
z
-
v1
.
z
)
*
eBC
.
z
;
mixed
vBC
=
(
v2
.
x
-
v1
.
x
)
*
eBC
.
x
+
(
v2
.
y
-
v1
.
y
)
*
eBC
.
y
+
(
v2
.
z
-
v1
.
z
)
*
eBC
.
z
;
mixed
vCA
=
(
v0
.
x
-
v2
.
x
)
*
eCA
.
x
+
(
v0
.
y
-
v2
.
y
)
*
eCA
.
y
+
(
v0
.
z
-
v2
.
z
)
*
eCA
.
z
;
mixed
vCA
=
(
v0
.
x
-
v2
.
x
)
*
eCA
.
x
+
(
v0
.
y
-
v2
.
y
)
*
eCA
.
y
+
(
v0
.
z
-
v2
.
z
)
*
eCA
.
z
;
...
@@ -574,7 +574,8 @@ extern "C" __global__ void applySettleToVelocities(int numClusters, mixed tol, c
...
@@ -574,7 +574,8 @@ extern "C" __global__ void applySettleToVelocities(int numClusters, mixed tol, c
/**
/**
* Compute the direction each CCMA constraint is pointing in. This is called once at the beginning of constraint evaluation.
* Compute the direction each CCMA constraint is pointing in. This is called once at the beginning of constraint evaluation.
*/
*/
extern
"C"
__global__
void
computeCCMAConstraintDirections
(
const
int2
*
__restrict__
constraintAtoms
,
mixed4
*
__restrict__
constraintDistance
,
const
real4
*
__restrict__
atomPositions
,
const
real4
*
__restrict__
posqCorrection
)
{
extern
"C"
__global__
void
computeCCMAConstraintDirections
(
const
int2
*
__restrict__
constraintAtoms
,
mixed4
*
__restrict__
constraintDistance
,
const
real4
*
__restrict__
atomPositions
,
const
real4
*
__restrict__
posqCorrection
,
int
*
__restrict__
converged
)
{
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_CCMA_CONSTRAINTS
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_CCMA_CONSTRAINTS
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
// Compute the direction for this constraint.
// Compute the direction for this constraint.
...
@@ -587,6 +588,10 @@ extern "C" __global__ void computeCCMAConstraintDirections(const int2* __restric
...
@@ -587,6 +588,10 @@ extern "C" __global__ void computeCCMAConstraintDirections(const int2* __restric
dir
.
z
=
oldPos1
.
z
-
oldPos2
.
z
;
dir
.
z
=
oldPos1
.
z
-
oldPos2
.
z
;
constraintDistance
[
index
]
=
dir
;
constraintDistance
[
index
]
=
dir
;
}
}
if
(
threadIdx
.
x
==
0
&&
blockIdx
.
x
==
0
)
{
converged
[
0
]
=
1
;
converged
[
1
]
=
0
;
}
}
}
/**
/**
...
@@ -605,6 +610,7 @@ extern "C" __global__ void computeCCMAPositionConstraintForce(const int2* __rest
...
@@ -605,6 +610,7 @@ extern "C" __global__ void computeCCMAPositionConstraintForce(const int2* __rest
__syncthreads
();
__syncthreads
();
mixed
lowerTol
=
1
-
2
*
tol
+
tol
*
tol
;
mixed
lowerTol
=
1
-
2
*
tol
+
tol
*
tol
;
mixed
upperTol
=
1
+
2
*
tol
+
tol
*
tol
;
mixed
upperTol
=
1
+
2
*
tol
+
tol
*
tol
;
bool
threadConverged
=
true
;
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_CCMA_CONSTRAINTS
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_CCMA_CONSTRAINTS
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
// Compute the force due to this constraint.
// Compute the force due to this constraint.
...
@@ -620,14 +626,13 @@ extern "C" __global__ void computeCCMAPositionConstraintForce(const int2* __rest
...
@@ -620,14 +626,13 @@ extern "C" __global__ void computeCCMAPositionConstraintForce(const int2* __rest
mixed
dist2
=
dir
.
w
*
dir
.
w
;
mixed
dist2
=
dir
.
w
*
dir
.
w
;
mixed
diff
=
dist2
-
rp2
;
mixed
diff
=
dist2
-
rp2
;
delta1
[
index
]
=
(
rrpr
>
d_ij2
*
1e-6
f
?
reducedMass
[
index
]
*
diff
/
rrpr
:
0.0
f
);
delta1
[
index
]
=
(
rrpr
>
d_ij2
*
1e-6
f
?
reducedMass
[
index
]
*
diff
/
rrpr
:
0.0
f
);
threadConverged
&=
(
rp2
>
lowerTol
*
dist2
&&
rp2
<
upperTol
*
dist2
);
// See whether it has converged.
}
if
(
groupConverged
&&
!
threadConverged
)
if
(
groupConverged
&&
(
rp2
<
lowerTol
*
dist2
||
rp2
>
upperTol
*
dist2
))
{
groupConverged
=
0
;
groupConverged
=
0
;
__syncthreads
();
if
(
threadIdx
.
x
==
0
&&
!
groupConverged
)
converged
[
iteration
%
2
]
=
0
;
converged
[
iteration
%
2
]
=
0
;
}
}
}
}
/**
/**
...
...
Prev
1
2
3
4
5
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment