Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
8cda1b79
Commit
8cda1b79
authored
Oct 04, 2012
by
Peter Eastman
Browse files
Finished implementing mixed precision model
parent
9ad85ebd
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
159 additions
and
88 deletions
+159
-88
platforms/cuda/src/CudaKernels.cpp
platforms/cuda/src/CudaKernels.cpp
+34
-24
platforms/cuda/src/kernels/brownian.cu
platforms/cuda/src/kernels/brownian.cu
+23
-10
platforms/cuda/src/kernels/constraints.cu
platforms/cuda/src/kernels/constraints.cu
+17
-6
platforms/cuda/src/kernels/customIntegrator.cu
platforms/cuda/src/kernels/customIntegrator.cu
+37
-10
platforms/cuda/src/kernels/customIntegratorGlobal.cu
platforms/cuda/src/kernels/customIntegratorGlobal.cu
+1
-1
platforms/cuda/src/kernels/customIntegratorPerDof.cu
platforms/cuda/src/kernels/customIntegratorPerDof.cu
+34
-9
platforms/cuda/src/kernels/integrationUtilities.cu
platforms/cuda/src/kernels/integrationUtilities.cu
+13
-28
No files found.
platforms/cuda/src/CudaKernels.cpp
View file @
8cda1b79
...
...
@@ -378,7 +378,8 @@ void CudaApplyConstraintsKernel::apply(ContextImpl& context, double tol) {
CudaIntegrationUtilities
&
integration
=
cu
.
getIntegrationUtilities
();
cu
.
clearBuffer
(
integration
.
getPosDelta
());
integration
.
applyConstraints
(
tol
);
void
*
args
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
cu
.
getIntegrationUtilities
().
getPosDelta
().
getDevicePointer
()};
CUdeviceptr
posCorrection
=
(
cu
.
getUseMixedPrecision
()
?
cu
.
getPosqCorrection
().
getDevicePointer
()
:
0
);
void
*
args
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
posCorrection
,
&
cu
.
getIntegrationUtilities
().
getPosDelta
().
getDevicePointer
()};
cu
.
executeKernel
(
applyDeltasKernel
,
args
,
cu
.
getNumAtoms
());
integration
.
computeVirtualSites
();
}
...
...
@@ -4144,12 +4145,13 @@ void CudaIntegrateBrownianStepKernel::execute(ContextImpl& context, const Browni
float
stepSizeFloat
=
(
float
)
stepSize
;
float
tauDtFloat
=
(
float
)
tauDt
;
float
noiseFloat
=
(
float
)
noise
;
bool
useDouble
=
cu
.
getUseDoublePrecision
()
||
cu
.
getUseMixedPrecision
();
// Call the first integration kernel.
int
randomIndex
=
integration
.
prepareRandomNumbers
(
cu
.
getPaddedNumAtoms
());
void
*
args1
[]
=
{
cu
.
getU
seDouble
Precision
()
?
(
void
*
)
&
tauDt
:
(
void
*
)
&
tauDtFloat
,
cu
.
getU
seDouble
Precision
()
?
(
void
*
)
&
noise
:
(
void
*
)
&
noiseFloat
,
void
*
args1
[]
=
{
u
seDouble
?
(
void
*
)
&
tauDt
:
(
void
*
)
&
tauDtFloat
,
u
seDouble
?
(
void
*
)
&
noise
:
(
void
*
)
&
noiseFloat
,
&
cu
.
getForce
().
getDevicePointer
(),
&
integration
.
getPosDelta
().
getDevicePointer
(),
&
cu
.
getVelm
().
getDevicePointer
(),
&
integration
.
getRandom
().
getDevicePointer
(),
&
randomIndex
};
cu
.
executeKernel
(
kernel1
,
args1
,
numAtoms
);
...
...
@@ -4160,8 +4162,9 @@ void CudaIntegrateBrownianStepKernel::execute(ContextImpl& context, const Browni
// Call the second integration kernel.
void
*
args2
[]
=
{
cu
.
getUseDoublePrecision
()
?
(
void
*
)
&
stepSize
:
(
void
*
)
&
stepSizeFloat
,
&
cu
.
getPosq
().
getDevicePointer
(),
&
cu
.
getVelm
().
getDevicePointer
(),
&
integration
.
getPosDelta
().
getDevicePointer
()};
CUdeviceptr
posCorrection
=
(
cu
.
getUseMixedPrecision
()
?
cu
.
getPosqCorrection
().
getDevicePointer
()
:
0
);
void
*
args2
[]
=
{
useDouble
?
(
void
*
)
&
stepSize
:
(
void
*
)
&
stepSizeFloat
,
&
cu
.
getPosq
().
getDevicePointer
(),
&
posCorrection
,
&
cu
.
getVelm
().
getDevicePointer
(),
&
integration
.
getPosDelta
().
getDevicePointer
()};
cu
.
executeKernel
(
kernel2
,
args2
,
numAtoms
);
integration
.
computeVirtualSites
();
...
...
@@ -4354,7 +4357,7 @@ public:
return
;
int
numAtoms
=
cu
.
getNumAtoms
();
const
vector
<
int
>&
order
=
cu
.
getAtomIndex
();
if
(
cu
.
getUseDoublePrecision
())
{
if
(
cu
.
getUseDoublePrecision
()
||
cu
.
getUseMixedPrecision
()
)
{
if
(
deviceValuesAreCurrent
)
perDofValues
.
getParameterValues
(
localPerDofValuesDouble
);
vector
<
vector
<
double
>
>
swap
(
3
*
numAtoms
);
...
...
@@ -4422,11 +4425,11 @@ void CudaIntegrateCustomStepKernel::initialize(const System& system, const Custo
cu
.
setAsCurrent
();
cu
.
getIntegrationUtilities
().
initRandomNumberGenerator
(
integrator
.
getRandomNumberSeed
());
numGlobalVariables
=
integrator
.
getNumGlobalVariables
();
int
elementSize
=
(
cu
.
getUseDoublePrecision
()
?
sizeof
(
double
)
:
sizeof
(
float
));
int
elementSize
=
(
cu
.
getUseDoublePrecision
()
||
cu
.
getUseMixedPrecision
()
?
sizeof
(
double
)
:
sizeof
(
float
));
globalValues
=
new
CudaArray
(
cu
,
max
(
1
,
numGlobalVariables
),
elementSize
,
"globalVariables"
);
sumBuffer
=
new
CudaArray
(
cu
,
3
*
system
.
getNumParticles
(),
elementSize
,
"sumBuffer"
);
energy
=
new
CudaArray
(
cu
,
1
,
e
lementSize
,
"energy"
);
perDofValues
=
new
CudaParameterSet
(
cu
,
integrator
.
getNumPerDofVariables
(),
3
*
system
.
getNumParticles
(),
"perDofVariables"
,
false
,
cu
.
getUseDoublePrecision
());
energy
=
new
CudaArray
(
cu
,
1
,
cu
.
getEnergyBuffer
().
getE
lementSize
()
,
"energy"
);
perDofValues
=
new
CudaParameterSet
(
cu
,
integrator
.
getNumPerDofVariables
(),
3
*
system
.
getNumParticles
(),
"perDofVariables"
,
false
,
cu
.
getUseDoublePrecision
()
||
cu
.
getUseMixedPrecision
()
);
cu
.
addReorderListener
(
new
ReorderListener
(
cu
,
*
perDofValues
,
localPerDofValuesFloat
,
localPerDofValuesDouble
,
deviceValuesAreCurrent
));
prevStepSize
=
-
1.0
;
SimTKOpenMMUtilities
::
setRandomNumberSeed
(
integrator
.
getRandomNumberSeed
());
...
...
@@ -4502,13 +4505,14 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
CudaIntegrationUtilities
&
integration
=
cu
.
getIntegrationUtilities
();
int
numAtoms
=
cu
.
getNumAtoms
();
int
numSteps
=
integrator
.
getNumComputations
();
bool
useDouble
=
cu
.
getUseDoublePrecision
()
||
cu
.
getUseMixedPrecision
();
if
(
!
hasInitializedKernels
)
{
hasInitializedKernels
=
true
;
// Initialize various data structures.
const
map
<
string
,
double
>&
params
=
context
.
getParameters
();
if
(
cu
.
getU
seDouble
Precision
()
)
{
if
(
u
seDouble
)
{
contextParameterValues
=
CudaArray
::
create
<
double
>
(
cu
,
max
(
1
,
(
int
)
params
.
size
()),
"contextParameters"
);
contextValuesDouble
.
resize
(
contextParameterValues
->
getSize
());
for
(
map
<
string
,
double
>::
const_iterator
iter
=
params
.
begin
();
iter
!=
params
.
end
();
++
iter
)
{
...
...
@@ -4674,9 +4678,9 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
compute
<<
createPerDofComputation
(
stepType
[
j
]
==
CustomIntegrator
::
ComputePerDof
?
variable
[
j
]
:
""
,
expression
[
j
],
i
,
integrator
,
forceName
[
j
],
energyName
[
j
]);
if
(
variable
[
j
]
==
"x"
)
{
if
(
storePosAsDelta
[
j
])
compute
<<
"posDelta[index] = convertFromDouble4(position-convertToDouble4(
posq[
index
]
));
\n
"
;
compute
<<
"posDelta[index] = convertFromDouble4(position-convertToDouble4(
loadPos(posq, posqCorrection,
index
)
));
\n
"
;
else
compute
<<
"
posq[
index
] =
convertFromDouble4(position);
\n
"
;
compute
<<
"
storePos(posq, posqCorrection,
index
,
convertFromDouble4(position)
)
;
\n
"
;
}
else
if
(
variable
[
j
]
==
"v"
)
compute
<<
"velm[index] = convertFromDouble4(velocity);
\n
"
;
...
...
@@ -4712,6 +4716,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
requiredUniform
[
step
]
=
numUniform
;
vector
<
void
*>
args1
;
args1
.
push_back
(
&
cu
.
getPosq
().
getDevicePointer
());
args1
.
push_back
(
NULL
);
args1
.
push_back
(
&
integration
.
getPosDelta
().
getDevicePointer
());
args1
.
push_back
(
&
cu
.
getVelm
().
getDevicePointer
());
args1
.
push_back
(
&
cu
.
getForce
().
getDevicePointer
());
...
...
@@ -4749,7 +4754,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
throw
OpenMMException
(
"Unknown global variable: "
+
variable
[
step
]);
defines
[
"SUM_BUFFER_SIZE"
]
=
cu
.
intToString
(
3
*
numAtoms
);
module
=
cu
.
createModule
(
CudaKernelSources
::
customIntegrator
,
defines
);
kernel
=
cu
.
getKernel
(
module
,
"compute
Sum"
);
kernel
=
cu
.
getKernel
(
module
,
useDouble
?
"computeDoubleSum"
:
"computeFloat
Sum"
);
kernels
[
step
].
push_back
(
kernel
);
kernelArgs
[
step
].
push_back
(
args2
);
}
...
...
@@ -4782,6 +4787,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
kernels
[
step
].
push_back
(
kernel
);
vector
<
void
*>
args
;
args
.
push_back
(
&
cu
.
getPosq
().
getDevicePointer
());
args
.
push_back
(
NULL
);
args
.
push_back
(
&
integration
.
getPosDelta
().
getDevicePointer
());
kernelArgs
[
step
].
push_back
(
args
);
}
...
...
@@ -4792,13 +4798,13 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
defines
[
"SUM_OUTPUT_INDEX"
]
=
"0"
;
defines
[
"SUM_BUFFER_SIZE"
]
=
cu
.
intToString
(
cu
.
getEnergyBuffer
().
getSize
());
CUmodule
module
=
cu
.
createModule
(
CudaKernelSources
::
customIntegrator
,
defines
);
sumEnergyKernel
=
cu
.
getKernel
(
module
,
"compute
Sum"
);
sumEnergyKernel
=
cu
.
getKernel
(
module
,
cu
.
getUseDoublePrecision
()
?
"computeDoubleSum"
:
"computeFloat
Sum"
);
}
// Make sure all values (variables, parameters, etc.) stored on the device are up to date.
if
(
!
deviceValuesAreCurrent
)
{
if
(
cu
.
getU
seDouble
Precision
()
)
if
(
u
seDouble
)
perDofValues
->
setParameterValues
(
localPerDofValuesDouble
);
else
perDofValues
->
setParameterValues
(
localPerDofValuesFloat
);
...
...
@@ -4807,7 +4813,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
localValuesAreCurrent
=
false
;
double
stepSize
=
integrator
.
getStepSize
();
if
(
stepSize
!=
prevStepSize
)
{
if
(
cu
.
getU
seDouble
Precision
()
)
{
if
(
u
seDouble
)
{
double
size
[]
=
{
0
,
stepSize
};
integration
.
getStepSize
().
upload
(
size
);
}
...
...
@@ -4818,7 +4824,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
prevStepSize
=
stepSize
;
}
bool
paramsChanged
=
false
;
if
(
cu
.
getU
seDouble
Precision
()
)
{
if
(
u
seDouble
)
{
for
(
int
i
=
0
;
i
<
(
int
)
parameterNames
.
size
();
i
++
)
{
double
value
=
context
.
getParameter
(
parameterNames
[
i
]);
if
(
value
!=
contextValuesDouble
[
i
])
{
...
...
@@ -4844,6 +4850,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
// Loop over computation steps in the integrator and execute them.
void
*
randomArgs
[]
=
{
&
uniformRandoms
->
getDevicePointer
(),
&
randomSeed
->
getDevicePointer
()};
CUdeviceptr
posCorrection
=
(
cu
.
getUseMixedPrecision
()
?
cu
.
getPosqCorrection
().
getDevicePointer
()
:
0
);
for
(
int
i
=
0
;
i
<
numSteps
;
i
++
)
{
if
((
needsForces
[
i
]
||
needsEnergy
[
i
])
&&
(
!
forcesAreValid
||
context
.
getLastForceGroups
()
!=
forceGroup
[
i
]))
{
// Recompute forces and/or energy. Figure out what is actually needed
...
...
@@ -4872,7 +4879,8 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
}
if
(
stepType
[
i
]
==
CustomIntegrator
::
ComputePerDof
&&
!
merged
[
i
])
{
int
randomIndex
=
integration
.
prepareRandomNumbers
(
requiredGaussian
[
i
]);
kernelArgs
[
i
][
0
][
9
]
=
&
randomIndex
;
kernelArgs
[
i
][
0
][
1
]
=
&
posCorrection
;
kernelArgs
[
i
][
0
][
10
]
=
&
randomIndex
;
if
(
requiredUniform
[
i
]
>
0
)
cu
.
executeKernel
(
randomKernel
,
&
randomArgs
[
0
],
numAtoms
);
cu
.
executeKernel
(
kernels
[
i
][
0
],
&
kernelArgs
[
i
][
0
][
0
],
numAtoms
);
...
...
@@ -4886,7 +4894,8 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
}
else
if
(
stepType
[
i
]
==
CustomIntegrator
::
ComputeSum
)
{
int
randomIndex
=
integration
.
prepareRandomNumbers
(
requiredGaussian
[
i
]);
kernelArgs
[
i
][
0
][
9
]
=
&
randomIndex
;
kernelArgs
[
i
][
0
][
1
]
=
&
posCorrection
;
kernelArgs
[
i
][
0
][
10
]
=
&
randomIndex
;
if
(
requiredUniform
[
i
]
>
0
)
cu
.
executeKernel
(
randomKernel
,
&
randomArgs
[
0
],
numAtoms
);
cu
.
executeKernel
(
kernels
[
i
][
0
],
&
kernelArgs
[
i
][
0
][
0
],
numAtoms
);
...
...
@@ -4898,6 +4907,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
}
else
if
(
stepType
[
i
]
==
CustomIntegrator
::
ConstrainPositions
)
{
cu
.
getIntegrationUtilities
().
applyConstraints
(
integrator
.
getConstraintTolerance
());
kernelArgs
[
i
][
0
][
1
]
=
&
posCorrection
;
cu
.
executeKernel
(
kernels
[
i
][
0
],
&
kernelArgs
[
i
][
0
][
0
],
numAtoms
);
cu
.
getIntegrationUtilities
().
computeVirtualSites
();
}
...
...
@@ -4918,7 +4928,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
void
CudaIntegrateCustomStepKernel
::
recordChangedParameters
(
ContextImpl
&
context
)
{
if
(
!
modifiesParameters
)
return
;
if
(
cu
.
getUseDoublePrecision
())
{
if
(
cu
.
getUseDoublePrecision
()
||
cu
.
getUseMixedPrecision
()
)
{
contextParameterValues
->
download
(
contextValuesDouble
);
for
(
int
i
=
0
;
i
<
(
int
)
parameterNames
.
size
();
i
++
)
{
double
value
=
context
.
getParameter
(
parameterNames
[
i
]);
...
...
@@ -4940,7 +4950,7 @@ void CudaIntegrateCustomStepKernel::getGlobalVariables(ContextImpl& context, vec
values
.
resize
(
numGlobalVariables
);
if
(
numGlobalVariables
==
0
)
return
;
if
(
cu
.
getUseDoublePrecision
())
if
(
cu
.
getUseDoublePrecision
()
||
cu
.
getUseMixedPrecision
()
)
globalValues
->
download
(
values
);
else
{
vector
<
float
>
buffer
;
...
...
@@ -4953,7 +4963,7 @@ void CudaIntegrateCustomStepKernel::getGlobalVariables(ContextImpl& context, vec
void
CudaIntegrateCustomStepKernel
::
setGlobalVariables
(
ContextImpl
&
context
,
const
vector
<
double
>&
values
)
{
if
(
numGlobalVariables
==
0
)
return
;
if
(
cu
.
getUseDoublePrecision
())
if
(
cu
.
getUseDoublePrecision
()
||
cu
.
getUseMixedPrecision
()
)
globalValues
->
upload
(
values
);
else
{
vector
<
float
>
buffer
(
numGlobalVariables
);
...
...
@@ -4966,7 +4976,7 @@ void CudaIntegrateCustomStepKernel::setGlobalVariables(ContextImpl& context, con
void
CudaIntegrateCustomStepKernel
::
getPerDofVariable
(
ContextImpl
&
context
,
int
variable
,
vector
<
Vec3
>&
values
)
const
{
values
.
resize
(
perDofValues
->
getNumObjects
()
/
3
);
const
vector
<
int
>&
order
=
cu
.
getAtomIndex
();
if
(
cu
.
getUseDoublePrecision
())
{
if
(
cu
.
getUseDoublePrecision
()
||
cu
.
getUseMixedPrecision
()
)
{
if
(
!
localValuesAreCurrent
)
{
perDofValues
->
getParameterValues
(
localPerDofValuesDouble
);
localValuesAreCurrent
=
true
;
...
...
@@ -4988,7 +4998,7 @@ void CudaIntegrateCustomStepKernel::getPerDofVariable(ContextImpl& context, int
void
CudaIntegrateCustomStepKernel
::
setPerDofVariable
(
ContextImpl
&
context
,
int
variable
,
const
vector
<
Vec3
>&
values
)
{
const
vector
<
int
>&
order
=
cu
.
getAtomIndex
();
if
(
cu
.
getUseDoublePrecision
())
{
if
(
cu
.
getUseDoublePrecision
()
||
cu
.
getUseMixedPrecision
()
)
{
if
(
!
localValuesAreCurrent
)
{
perDofValues
->
getParameterValues
(
localPerDofValuesDouble
);
localValuesAreCurrent
=
true
;
...
...
platforms/cuda/src/kernels/brownian.cu
View file @
8cda1b79
...
...
@@ -2,12 +2,12 @@
* Perform the first step of Brownian integration.
*/
extern
"C"
__global__
void
integrateBrownianPart1
(
real
tauDeltaT
,
real
noiseAmplitude
,
const
long
long
*
__restrict__
force
,
real
4
*
__restrict__
posDelta
,
const
real
4
*
__restrict__
velm
,
const
float4
*
__restrict__
random
,
unsigned
int
randomIndex
)
{
extern
"C"
__global__
void
integrateBrownianPart1
(
mixed
tauDeltaT
,
mixed
noiseAmplitude
,
const
long
long
*
__restrict__
force
,
mixed
4
*
__restrict__
posDelta
,
const
mixed
4
*
__restrict__
velm
,
const
float4
*
__restrict__
random
,
unsigned
int
randomIndex
)
{
randomIndex
+=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
const
real
fscale
=
tauDeltaT
/
(
real
)
0xFFFFFFFF
;
const
mixed
fscale
=
tauDeltaT
/
(
mixed
)
0xFFFFFFFF
;
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_ATOMS
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
real
invMass
=
velm
[
index
].
w
;
mixed
invMass
=
velm
[
index
].
w
;
if
(
invMass
!=
0
)
{
posDelta
[
index
].
x
=
fscale
*
invMass
*
force
[
index
]
+
noiseAmplitude
*
SQRT
(
invMass
)
*
random
[
randomIndex
].
x
;
posDelta
[
index
].
y
=
fscale
*
invMass
*
force
[
index
+
PADDED_NUM_ATOMS
]
+
noiseAmplitude
*
SQRT
(
invMass
)
*
random
[
randomIndex
].
y
;
...
...
@@ -21,17 +21,30 @@ extern "C" __global__ void integrateBrownianPart1(real tauDeltaT, real noiseAmpl
* Perform the second step of Brownian integration.
*/
extern
"C"
__global__
void
integrateBrownianPart2
(
real
deltaT
,
real4
*
posq
,
real4
*
velm
,
const
real
4
*
__restrict__
posDelta
)
{
const
real
oneOverDeltaT
=
RECIP
(
deltaT
);
extern
"C"
__global__
void
integrateBrownianPart2
(
mixed
deltaT
,
real4
*
posq
,
real4
*
__restrict__
posqCorrection
,
mixed4
*
velm
,
const
mixed
4
*
__restrict__
posDelta
)
{
const
mixed
oneOverDeltaT
=
RECIP
(
deltaT
);
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_ATOMS
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
if
(
velm
[
index
].
w
!=
0
)
{
real
4
delta
=
posDelta
[
index
];
mixed
4
delta
=
posDelta
[
index
];
velm
[
index
].
x
=
oneOverDeltaT
*
delta
.
x
;
velm
[
index
].
y
=
oneOverDeltaT
*
delta
.
y
;
velm
[
index
].
z
=
oneOverDeltaT
*
delta
.
z
;
posq
[
index
].
x
=
posq
[
index
].
x
+
delta
.
x
;
posq
[
index
].
y
=
posq
[
index
].
y
+
delta
.
y
;
posq
[
index
].
z
=
posq
[
index
].
z
+
delta
.
z
;
#ifdef USE_MIXED_PRECISION
real4
pos1
=
posq
[
index
];
real4
pos2
=
posqCorrection
[
index
];
mixed4
pos
=
make_mixed4
(
pos1
.
x
+
(
mixed
)
pos2
.
x
,
pos1
.
y
+
(
mixed
)
pos2
.
y
,
pos1
.
z
+
(
mixed
)
pos2
.
z
,
pos1
.
w
);
#else
real4
pos
=
posq
[
index
];
#endif
pos
.
x
+=
delta
.
x
;
pos
.
y
+=
delta
.
y
;
pos
.
z
+=
delta
.
z
;
#ifdef USE_MIXED_PRECISION
posq
[
index
]
=
make_real4
((
real
)
pos
.
x
,
(
real
)
pos
.
y
,
(
real
)
pos
.
z
,
(
real
)
pos
.
w
);
posqCorrection
[
index
]
=
make_real4
(
pos
.
x
-
(
real
)
pos
.
x
,
pos
.
y
-
(
real
)
pos
.
y
,
pos
.
z
-
(
real
)
pos
.
z
,
0
);
#else
posq
[
index
]
=
pos
;
#endif
}
}
}
platforms/cuda/src/kernels/constraints.cu
View file @
8cda1b79
extern
"C"
__global__
void
applyPositionDeltas
(
real4
*
__restrict__
posq
,
real4
*
__restrict__
posDelta
)
{
extern
"C"
__global__
void
applyPositionDeltas
(
real4
*
__restrict__
posq
,
real4
*
__restrict__
posqCorrection
,
mixed4
*
__restrict__
posDelta
)
{
for
(
unsigned
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_ATOMS
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
real4
position
=
posq
[
index
];
position
.
x
+=
posDelta
[
index
].
x
;
position
.
y
+=
posDelta
[
index
].
y
;
position
.
z
+=
posDelta
[
index
].
z
;
posq
[
index
]
=
position
;
#ifdef USE_MIXED_PRECISION
real4
pos1
=
posq
[
index
];
real4
pos2
=
posqCorrection
[
index
];
mixed4
pos
=
make_mixed4
(
pos1
.
x
+
(
mixed
)
pos2
.
x
,
pos1
.
y
+
(
mixed
)
pos2
.
y
,
pos1
.
z
+
(
mixed
)
pos2
.
z
,
pos1
.
w
);
#else
mixed4
pos
=
posq
[
index
];
#endif
pos
.
x
+=
posDelta
[
index
].
x
;
pos
.
y
+=
posDelta
[
index
].
y
;
pos
.
z
+=
posDelta
[
index
].
z
;
#ifdef USE_MIXED_PRECISION
posq
[
index
]
=
make_real4
((
real
)
pos
.
x
,
(
real
)
pos
.
y
,
(
real
)
pos
.
z
,
(
real
)
pos
.
w
);
posqCorrection
[
index
]
=
make_real4
(
pos
.
x
-
(
real
)
pos
.
x
,
pos
.
y
-
(
real
)
pos
.
y
,
pos
.
z
-
(
real
)
pos
.
z
,
0
);
#else
posq
[
index
]
=
pos
;
#endif
}
}
platforms/cuda/src/kernels/customIntegrator.cu
View file @
8cda1b79
extern
"C"
__global__
void
computeSum
(
const
real
*
__restrict__
sumBuffer
,
real
*
result
)
{
__shared__
real
tempBuffer
[
WORK_GROUP_SIZE
];
extern
"C"
__global__
void
compute
Float
Sum
(
const
float
*
__restrict__
sumBuffer
,
float
*
result
)
{
__shared__
float
tempBuffer
[
WORK_GROUP_SIZE
];
const
unsigned
int
thread
=
threadIdx
.
x
;
real
sum
=
0
;
float
sum
=
0
;
for
(
unsigned
int
index
=
thread
;
index
<
SUM_BUFFER_SIZE
;
index
+=
blockDim
.
x
)
sum
+=
sumBuffer
[
index
];
tempBuffer
[
thread
]
=
sum
;
...
...
@@ -14,14 +14,41 @@ extern "C" __global__ void computeSum(const real* __restrict__ sumBuffer, real*
result
[
SUM_OUTPUT_INDEX
]
=
tempBuffer
[
0
];
}
extern
"C"
__global__
void
applyPositionDeltas
(
real4
*
__restrict__
posq
,
real4
*
__restrict__
posDelta
)
{
extern
"C"
__global__
void
computeDoubleSum
(
const
double
*
__restrict__
sumBuffer
,
double
*
result
)
{
__shared__
double
tempBuffer
[
WORK_GROUP_SIZE
];
const
unsigned
int
thread
=
threadIdx
.
x
;
double
sum
=
0
;
for
(
unsigned
int
index
=
thread
;
index
<
SUM_BUFFER_SIZE
;
index
+=
blockDim
.
x
)
sum
+=
sumBuffer
[
index
];
tempBuffer
[
thread
]
=
sum
;
for
(
int
i
=
1
;
i
<
WORK_GROUP_SIZE
;
i
*=
2
)
{
__syncthreads
();
if
(
thread
%
(
i
*
2
)
==
0
&&
thread
+
i
<
WORK_GROUP_SIZE
)
tempBuffer
[
thread
]
+=
tempBuffer
[
thread
+
i
];
}
if
(
thread
==
0
)
result
[
SUM_OUTPUT_INDEX
]
=
tempBuffer
[
0
];
}
extern
"C"
__global__
void
applyPositionDeltas
(
real4
*
__restrict__
posq
,
real4
*
__restrict__
posqCorrection
,
mixed4
*
__restrict__
posDelta
)
{
for
(
unsigned
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_ATOMS
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
real4
position
=
posq
[
index
];
position
.
x
+=
posDelta
[
index
].
x
;
position
.
y
+=
posDelta
[
index
].
y
;
position
.
z
+=
posDelta
[
index
].
z
;
posq
[
index
]
=
position
;
posDelta
[
index
]
=
make_real4
(
0
,
0
,
0
,
0
);
#ifdef USE_MIXED_PRECISION
real4
pos1
=
posq
[
index
];
real4
pos2
=
posqCorrection
[
index
];
mixed4
pos
=
make_mixed4
(
pos1
.
x
+
(
mixed
)
pos2
.
x
,
pos1
.
y
+
(
mixed
)
pos2
.
y
,
pos1
.
z
+
(
mixed
)
pos2
.
z
,
pos1
.
w
);
#else
real4
pos
=
posq
[
index
];
#endif
pos
.
x
+=
posDelta
[
index
].
x
;
pos
.
y
+=
posDelta
[
index
].
y
;
pos
.
z
+=
posDelta
[
index
].
z
;
#ifdef USE_MIXED_PRECISION
posq
[
index
]
=
make_real4
((
real
)
pos
.
x
,
(
real
)
pos
.
y
,
(
real
)
pos
.
z
,
(
real
)
pos
.
w
);
posqCorrection
[
index
]
=
make_real4
(
pos
.
x
-
(
real
)
pos
.
x
,
pos
.
y
-
(
real
)
pos
.
y
,
pos
.
z
-
(
real
)
pos
.
z
,
0
);
#else
posq
[
index
]
=
pos
;
#endif
posDelta
[
index
]
=
make_mixed4
(
0
,
0
,
0
,
0
);
}
}
...
...
platforms/cuda/src/kernels/customIntegratorGlobal.cu
View file @
8cda1b79
extern
"C"
__global__
void
computeGlobal
(
real
2
*
__restrict__
dt
,
real
*
__restrict__
globals
,
real
*
__restrict__
params
,
extern
"C"
__global__
void
computeGlobal
(
mixed
2
*
__restrict__
dt
,
mixed
*
__restrict__
globals
,
mixed
*
__restrict__
params
,
float
uniform
,
float
gaussian
,
const
real
*
__restrict__
energy
)
{
COMPUTE_STEP
}
platforms/cuda/src/kernels/customIntegratorPerDof.cu
View file @
8cda1b79
inline
__device__
double4
convertToDouble4
(
real4
a
)
{
/**
* Load the position of a particle.
*/
inline
__device__
mixed4
loadPos
(
const
real4
*
__restrict__
posq
,
const
real4
*
__restrict__
posqCorrection
,
int
index
)
{
#ifdef USE_MIXED_PRECISION
real4
pos1
=
posq
[
index
];
real4
pos2
=
posqCorrection
[
index
];
return
make_mixed4
(
pos1
.
x
+
(
mixed
)
pos2
.
x
,
pos1
.
y
+
(
mixed
)
pos2
.
y
,
pos1
.
z
+
(
mixed
)
pos2
.
z
,
pos1
.
w
);
#else
return
posq
[
index
];
#endif
}
/**
* Store the position of a particle.
*/
inline
__device__
void
storePos
(
real4
*
__restrict__
posq
,
real4
*
__restrict__
posqCorrection
,
int
index
,
mixed4
pos
)
{
#ifdef USE_MIXED_PRECISION
posq
[
index
]
=
make_real4
((
real
)
pos
.
x
,
(
real
)
pos
.
y
,
(
real
)
pos
.
z
,
(
real
)
pos
.
w
);
posqCorrection
[
index
]
=
make_real4
(
pos
.
x
-
(
real
)
pos
.
x
,
pos
.
y
-
(
real
)
pos
.
y
,
pos
.
z
-
(
real
)
pos
.
z
,
0
);
#else
posq
[
index
]
=
pos
;
#endif
}
inline
__device__
double4
convertToDouble4
(
mixed4
a
)
{
return
make_double4
(
a
.
x
,
a
.
y
,
a
.
z
,
a
.
w
);
}
inline
__device__
real
4
convertFromDouble4
(
double4
a
)
{
return
make_
real
4
(
a
.
x
,
a
.
y
,
a
.
z
,
a
.
w
);
inline
__device__
mixed
4
convertFromDouble4
(
double4
a
)
{
return
make_
mixed
4
(
a
.
x
,
a
.
y
,
a
.
z
,
a
.
w
);
}
extern
"C"
__global__
void
computePerDof
(
real4
*
__restrict__
posq
,
real4
*
__restrict__
pos
Delta
,
real
4
*
__restrict__
velm
,
const
long
long
*
__restrict__
force
,
const
real
2
*
__restrict__
dt
,
const
real
*
__restrict__
globals
,
const
real
*
__restrict__
params
,
real
*
__restrict__
sum
,
const
float4
*
__restrict__
gaussianValues
,
extern
"C"
__global__
void
computePerDof
(
real4
*
__restrict__
posq
,
real4
*
__restrict__
pos
qCorrection
,
mixed
4
*
__restrict__
posDelta
,
mixed4
*
__restrict__
velm
,
const
long
long
*
__restrict__
force
,
const
mixed
2
*
__restrict__
dt
,
const
mixed
*
__restrict__
globals
,
const
mixed
*
__restrict__
params
,
mixed
*
__restrict__
sum
,
const
float4
*
__restrict__
gaussianValues
,
unsigned
int
randomIndex
,
const
float4
*
__restrict__
uniformValues
,
const
real
*
__restrict__
energy
PARAMETER_ARGUMENTS
)
{
real
stepSize
=
dt
[
0
].
y
;
mixed
stepSize
=
dt
[
0
].
y
;
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
randomIndex
+=
index
;
const
double
forceScale
=
1.0
/
0xFFFFFFFF
;
while
(
index
<
NUM_ATOMS
)
{
#ifdef LOAD_POS_AS_DELTA
double4
position
=
convertToDouble4
(
posq
[
index
]
+
posDelta
[
index
]);
double4
position
=
convertToDouble4
(
loadPos
(
posq
,
posqCorrection
,
index
)
+
posDelta
[
index
]);
#else
double4
position
=
convertToDouble4
(
posq
[
index
]
);
double4
position
=
convertToDouble4
(
loadPos
(
posq
,
posqCorrection
,
index
)
);
#endif
double4
velocity
=
convertToDouble4
(
velm
[
index
]);
double4
f
=
make_double4
(
forceScale
*
force
[
index
],
forceScale
*
force
[
index
+
PADDED_NUM_ATOMS
],
forceScale
*
force
[
index
+
PADDED_NUM_ATOMS
*
2
],
0.0
);
...
...
platforms/cuda/src/kernels/integrationUtilities.cu
View file @
8cda1b79
...
...
@@ -779,10 +779,11 @@ inline __device__ real3 loadForce(int index, long long* __restrict__ force) {
return
make_real3
(
scale
*
force
[
index
],
scale
*
force
[
index
+
PADDED_NUM_ATOMS
],
scale
*
force
[
index
+
PADDED_NUM_ATOMS
*
2
]);
}
inline
__device__
void
storeForce
(
int
index
,
long
long
*
__restrict__
force
,
real3
value
)
{
force
[
index
]
=
(
long
long
)
(
value
.
x
*
0xFFFFFFFF
);
force
[
index
+
PADDED_NUM_ATOMS
]
=
(
long
long
)
(
value
.
y
*
0xFFFFFFFF
);
force
[
index
+
2
*
PADDED_NUM_ATOMS
]
=
(
long
long
)
(
value
.
z
*
0xFFFFFFFF
);
inline
__device__
void
addForce
(
int
index
,
long
long
*
__restrict__
force
,
real3
value
)
{
unsigned
long
long
*
f
=
(
unsigned
long
long
*
)
force
;
atomicAdd
(
&
f
[
index
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
value
.
x
*
0xFFFFFFFF
)));
atomicAdd
(
&
f
[
index
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
value
.
y
*
0xFFFFFFFF
)));
atomicAdd
(
&
f
[
index
+
PADDED_NUM_ATOMS
*
2
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
value
.
z
*
0xFFFFFFFF
)));
}
/**
...
...
@@ -799,12 +800,8 @@ extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__
int4
atoms
=
avg2Atoms
[
index
];
real2
weights
=
avg2Weights
[
index
];
real3
f
=
loadForce
(
atoms
.
x
,
force
);
real3
f1
=
loadForce
(
atoms
.
y
,
force
);
real3
f2
=
loadForce
(
atoms
.
z
,
force
);
f1
+=
f
*
weights
.
x
;
f2
+=
f
*
weights
.
y
;
storeForce
(
atoms
.
y
,
force
,
f1
);
storeForce
(
atoms
.
z
,
force
,
f2
);
addForce
(
atoms
.
y
,
force
,
f
*
weights
.
x
);
addForce
(
atoms
.
z
,
force
,
f
*
weights
.
y
);
}
// Three particle average sites.
...
...
@@ -813,15 +810,9 @@ extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__
int4
atoms
=
avg3Atoms
[
index
];
real4
weights
=
avg3Weights
[
index
];
real3
f
=
loadForce
(
atoms
.
x
,
force
);
real3
f1
=
loadForce
(
atoms
.
y
,
force
);
real3
f2
=
loadForce
(
atoms
.
z
,
force
);
real3
f3
=
loadForce
(
atoms
.
w
,
force
);
f1
+=
f
*
weights
.
x
;
f2
+=
f
*
weights
.
y
;
f3
+=
f
*
weights
.
z
;
storeForce
(
atoms
.
y
,
force
,
f1
);
storeForce
(
atoms
.
z
,
force
,
f2
);
storeForce
(
atoms
.
w
,
force
,
f3
);
addForce
(
atoms
.
y
,
force
,
f
*
weights
.
x
);
addForce
(
atoms
.
z
,
force
,
f
*
weights
.
y
);
addForce
(
atoms
.
w
,
force
,
f
*
weights
.
z
);
}
// Out of plane sites.
...
...
@@ -835,20 +826,14 @@ extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__
mixed4
v12
=
pos2
-
pos1
;
mixed4
v13
=
pos3
-
pos1
;
real3
f
=
loadForce
(
atoms
.
x
,
force
);
real3
f1
=
loadForce
(
atoms
.
y
,
force
);
real3
f2
=
loadForce
(
atoms
.
z
,
force
);
real3
f3
=
loadForce
(
atoms
.
w
,
force
);
real3
fp2
=
make_real3
((
real
)
(
weights
.
x
*
f
.
x
-
weights
.
z
*
v13
.
z
*
f
.
y
+
weights
.
z
*
v13
.
y
*
f
.
z
),
(
real
)
(
weights
.
z
*
v13
.
z
*
f
.
x
+
weights
.
x
*
f
.
y
-
weights
.
z
*
v13
.
x
*
f
.
z
),
(
real
)
(
-
weights
.
z
*
v13
.
y
*
f
.
x
+
weights
.
z
*
v13
.
x
*
f
.
y
+
weights
.
x
*
f
.
z
));
real3
fp3
=
make_real3
((
real
)
(
weights
.
y
*
f
.
x
+
weights
.
z
*
v12
.
z
*
f
.
y
-
weights
.
z
*
v12
.
y
*
f
.
z
),
(
real
)
(
-
weights
.
z
*
v12
.
z
*
f
.
x
+
weights
.
y
*
f
.
y
+
weights
.
z
*
v12
.
x
*
f
.
z
),
(
real
)
(
weights
.
z
*
v12
.
y
*
f
.
x
-
weights
.
z
*
v12
.
x
*
f
.
y
+
weights
.
y
*
f
.
z
));
f1
+=
f
-
fp2
-
fp3
;
f2
+=
fp2
;
f3
+=
fp3
;
storeForce
(
atoms
.
y
,
force
,
f1
);
storeForce
(
atoms
.
z
,
force
,
f2
);
storeForce
(
atoms
.
w
,
force
,
f3
);
addForce
(
atoms
.
y
,
force
,
f
-
fp2
-
fp3
);
addForce
(
atoms
.
z
,
force
,
fp2
);
addForce
(
atoms
.
w
,
force
,
fp3
);
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment