Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
cd6af26e
Commit
cd6af26e
authored
Nov 16, 2012
by
Peter Eastman
Browse files
RPMD supports mixed and double precision
parent
b8b2e1ef
Changes
10
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
484 additions
and
352 deletions
+484
-352
plugins/rpmd/platforms/cuda/src/CudaRpmdKernels.cpp
plugins/rpmd/platforms/cuda/src/CudaRpmdKernels.cpp
+143
-96
plugins/rpmd/platforms/cuda/src/CudaRpmdKernels.h
plugins/rpmd/platforms/cuda/src/CudaRpmdKernels.h
+1
-1
plugins/rpmd/platforms/cuda/src/kernels/rpmd.cu
plugins/rpmd/platforms/cuda/src/kernels/rpmd.cu
+80
-68
plugins/rpmd/platforms/cuda/tests/CMakeLists.txt
plugins/rpmd/platforms/cuda/tests/CMakeLists.txt
+5
-1
plugins/rpmd/platforms/cuda/tests/TestCudaRpmd.cpp
plugins/rpmd/platforms/cuda/tests/TestCudaRpmd.cpp
+3
-2
plugins/rpmd/platforms/opencl/src/OpenCLRpmdKernels.cpp
plugins/rpmd/platforms/opencl/src/OpenCLRpmdKernels.cpp
+171
-125
plugins/rpmd/platforms/opencl/src/OpenCLRpmdKernels.h
plugins/rpmd/platforms/opencl/src/OpenCLRpmdKernels.h
+2
-2
plugins/rpmd/platforms/opencl/src/kernels/rpmd.cl
plugins/rpmd/platforms/opencl/src/kernels/rpmd.cl
+70
-54
plugins/rpmd/platforms/opencl/tests/CMakeLists.txt
plugins/rpmd/platforms/opencl/tests/CMakeLists.txt
+5
-1
plugins/rpmd/platforms/opencl/tests/TestOpenCLRpmd.cpp
plugins/rpmd/platforms/opencl/tests/TestOpenCLRpmd.cpp
+4
-2
No files found.
plugins/rpmd/platforms/cuda/src/CudaRpmdKernels.cpp
View file @
cd6af26e
...
...
@@ -70,6 +70,7 @@ CudaIntegrateRPMDStepKernel::~CudaIntegrateRPMDStepKernel() {
if
(
velocities
!=
NULL
)
delete
velocities
;
}
void
CudaIntegrateRPMDStepKernel
::
initialize
(
const
System
&
system
,
const
RPMDIntegrator
&
integrator
)
{
cu
.
getPlatformData
().
initializeContexts
(
system
);
numCopies
=
integrator
.
getNumCopies
();
...
...
@@ -80,20 +81,33 @@ void CudaIntegrateRPMDStepKernel::initialize(const System& system, const RPMDInt
if
(
numCopies
!=
findFFTDimension
(
numCopies
))
throw
OpenMMException
(
"RPMDIntegrator: the number of copies must be a multiple of powers of 2, 3, and 5."
);
int
paddedParticles
=
cu
.
getPaddedNumAtoms
();
bool
useDoublePrecision
=
(
cu
.
getUseDoublePrecision
()
||
cu
.
getUseMixedPrecision
());
int
elementSize
=
(
useDoublePrecision
?
sizeof
(
double4
)
:
sizeof
(
float4
));
forces
=
CudaArray
::
create
<
long
long
>
(
cu
,
numCopies
*
paddedParticles
*
3
,
"rpmdForces"
);
positions
=
CudaArray
::
create
<
float4
>
(
cu
,
numCopies
*
paddedParticles
,
"rpmdPositions"
);
velocities
=
CudaArray
::
create
<
float4
>
(
cu
,
numCopies
*
paddedParticles
,
"rpmdVelocities"
);
positions
=
new
CudaArray
(
cu
,
numCopies
*
paddedParticles
,
elementSize
,
"rpmdPositions"
);
velocities
=
new
CudaArray
(
cu
,
numCopies
*
paddedParticles
,
elementSize
,
"rpmdVelocities"
);
cu
.
getIntegrationUtilities
().
initRandomNumberGenerator
((
unsigned
int
)
integrator
.
getRandomNumberSeed
());
// Fill in the posq and velm arrays with safe values to avoid a risk of nans.
vector
<
float4
>
temp
(
positions
->
getSize
());
for
(
int
i
=
0
;
i
<
positions
->
getSize
();
i
++
)
temp
[
i
]
=
make_float4
(
0
,
0
,
0
,
0
);
positions
->
upload
(
temp
);
for
(
int
i
=
0
;
i
<
velocities
->
getSize
();
i
++
)
temp
[
i
]
=
make_float4
(
0
,
0
,
0
,
1
);
velocities
->
upload
(
temp
);
if
(
useDoublePrecision
)
{
vector
<
double4
>
temp
(
positions
->
getSize
());
for
(
int
i
=
0
;
i
<
positions
->
getSize
();
i
++
)
temp
[
i
]
=
make_double4
(
0
,
0
,
0
,
0
);
positions
->
upload
(
temp
);
for
(
int
i
=
0
;
i
<
velocities
->
getSize
();
i
++
)
temp
[
i
]
=
make_double4
(
0
,
0
,
0
,
1
);
velocities
->
upload
(
temp
);
}
else
{
vector
<
float4
>
temp
(
positions
->
getSize
());
for
(
int
i
=
0
;
i
<
positions
->
getSize
();
i
++
)
temp
[
i
]
=
make_float4
(
0
,
0
,
0
,
0
);
positions
->
upload
(
temp
);
for
(
int
i
=
0
;
i
<
velocities
->
getSize
();
i
++
)
temp
[
i
]
=
make_float4
(
0
,
0
,
0
,
1
);
velocities
->
upload
(
temp
);
}
// Create kernels.
...
...
@@ -114,8 +128,9 @@ void CudaIntegrateRPMDStepKernel::initialize(const System& system, const RPMDInt
pileKernel
=
cu
.
getKernel
(
module
,
"applyPileThermostat"
);
stepKernel
=
cu
.
getKernel
(
module
,
"integrateStep"
);
velocitiesKernel
=
cu
.
getKernel
(
module
,
"advanceVelocities"
);
copyToContextKernel
=
cu
.
getKernel
(
module
,
"copyToContext"
);
copyFromContextKernel
=
cu
.
getKernel
(
module
,
"copyFromContext"
);
copyPositionsToContextKernel
=
cu
.
getKernel
(
module
,
"copyPositionsToContext"
);
copyVelocitiesToContextKernel
=
cu
.
getKernel
(
module
,
"copyVelocitiesToContext"
);
copyForcesFromContextKernel
=
cu
.
getKernel
(
module
,
"copyForcesFromContext"
);
translateKernel
=
cu
.
getKernel
(
module
,
"applyCellTranslations"
);
}
...
...
@@ -129,19 +144,23 @@ void CudaIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegr
// Apply the PILE-L thermostat.
bool
useDoublePrecision
=
(
cu
.
getUseDoublePrecision
()
||
cu
.
getUseMixedPrecision
());
double
dt
=
integrator
.
getStepSize
();
float
dtFloat
=
(
float
)
dt
;
void
*
dtPtr
=
(
useDoublePrecision
?
(
void
*
)
&
dt
:
(
void
*
)
&
dtFloat
);
double
kT
=
integrator
.
getTemperature
()
*
BOLTZ
;
float
kTFloat
=
(
float
)
kT
;
void
*
kTPtr
=
(
useDoublePrecision
?
(
void
*
)
&
kT
:
(
void
*
)
&
kTFloat
);
double
friction
=
integrator
.
getFriction
();
float
frictionFloat
=
(
float
)
friction
;
void
*
frictionPtr
=
(
useDoublePrecision
?
(
void
*
)
&
friction
:
(
void
*
)
&
frictionFloat
);
int
randomIndex
=
integration
.
prepareRandomNumbers
(
numParticles
*
numCopies
);
void
*
pileArgs
[]
=
{
&
velocities
->
getDevicePointer
(),
&
integration
.
getRandom
().
getDevicePointer
(),
&
randomIndex
,
&
dt
Float
,
&
kT
Float
,
&
friction
Float
};
void
*
pileArgs
[]
=
{
&
velocities
->
getDevicePointer
(),
&
integration
.
getRandom
().
getDevicePointer
(),
&
randomIndex
,
dt
Ptr
,
kT
Ptr
,
friction
Ptr
};
cu
.
executeKernel
(
pileKernel
,
pileArgs
,
numParticles
*
numCopies
,
workgroupSize
);
// Update positions and velocities.
void
*
stepArgs
[]
=
{
&
positions
->
getDevicePointer
(),
&
velocities
->
getDevicePointer
(),
&
forces
->
getDevicePointer
(),
&
dt
Float
,
&
kT
Float
};
void
*
stepArgs
[]
=
{
&
positions
->
getDevicePointer
(),
&
velocities
->
getDevicePointer
(),
&
forces
->
getDevicePointer
(),
dt
Ptr
,
kT
Ptr
};
cu
.
executeKernel
(
stepKernel
,
stepArgs
,
numParticles
*
numCopies
,
workgroupSize
);
// Calculate forces based on the updated positions.
...
...
@@ -150,7 +169,7 @@ void CudaIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegr
// Update velocities.
void
*
velocitiesArgs
[]
=
{
&
velocities
->
getDevicePointer
(),
&
forces
->
getDevicePointer
(),
&
dt
Float
};
void
*
velocitiesArgs
[]
=
{
&
velocities
->
getDevicePointer
(),
&
forces
->
getDevicePointer
(),
dt
Ptr
};
cu
.
executeKernel
(
velocitiesKernel
,
velocitiesArgs
,
numParticles
*
numCopies
,
workgroupSize
);
// Apply the PILE-L thermostat again.
...
...
@@ -167,10 +186,10 @@ void CudaIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegr
void
CudaIntegrateRPMDStepKernel
::
computeForces
(
ContextImpl
&
context
)
{
for
(
int
i
=
0
;
i
<
numCopies
;
i
++
)
{
void
*
copyToContextArgs
[]
=
{
&
positions
->
getDevicePointer
(),
&
cu
.
getPosq
().
getDevicePointer
(),
&
cu
.
getAtomIndexArray
().
getDevicePointer
(),
&
i
};
cu
.
executeKernel
(
copyToContextKernel
,
copyToContextArgs
,
cu
.
getNumAtoms
());
cu
.
executeKernel
(
copy
Positions
ToContextKernel
,
copyToContextArgs
,
cu
.
getNumAtoms
());
context
.
calcForcesAndEnergy
(
true
,
false
);
void
*
copyFromContextArgs
[]
=
{
&
cu
.
getForce
().
getDevicePointer
(),
&
forces
->
getDevicePointer
(),
&
cu
.
getAtomIndexArray
().
getDevicePointer
(),
&
i
};
cu
.
executeKernel
(
copyFromContextKernel
,
copyFromContextArgs
,
cu
.
getNumAtoms
());
cu
.
executeKernel
(
copyF
orcesF
romContextKernel
,
copyFromContextArgs
,
cu
.
getNumAtoms
());
if
(
cu
.
getAtomsWereReordered
()
&&
cu
.
getNonbondedUtilities
().
getUsePeriodic
())
{
// Atoms may have been translated into a different periodic box, so apply
// the same translation to all the beads.
...
...
@@ -190,11 +209,29 @@ void CudaIntegrateRPMDStepKernel::setPositions(int copy, const vector<Vec3>& pos
throw
OpenMMException
(
"RPMDIntegrator: Cannot set positions before the integrator is added to a Context"
);
if
(
pos
.
size
()
!=
numParticles
)
throw
OpenMMException
(
"RPMDIntegrator: wrong number of values passed to setPositions()"
);
vector
<
float4
>
posq
(
cu
.
getPaddedNumAtoms
());
cu
.
getPosq
().
download
(
posq
);
for
(
int
i
=
0
;
i
<
numParticles
;
i
++
)
posq
[
i
]
=
make_float4
((
float
)
pos
[
i
][
0
],
(
float
)
pos
[
i
][
1
],
(
float
)
pos
[
i
][
2
],
posq
[
i
].
w
);
CUresult
result
=
cuMemcpyHtoD
(
positions
->
getDevicePointer
()
+
copy
*
cu
.
getPaddedNumAtoms
()
*
sizeof
(
float4
),
&
posq
[
0
],
numParticles
*
sizeof
(
float4
));
CUresult
result
;
if
(
cu
.
getUseDoublePrecision
())
{
vector
<
double4
>
posq
(
cu
.
getPaddedNumAtoms
());
cu
.
getPosq
().
download
(
posq
);
for
(
int
i
=
0
;
i
<
numParticles
;
i
++
)
posq
[
i
]
=
make_double4
(
pos
[
i
][
0
],
pos
[
i
][
1
],
pos
[
i
][
2
],
posq
[
i
].
w
);
result
=
cuMemcpyHtoD
(
positions
->
getDevicePointer
()
+
copy
*
cu
.
getPaddedNumAtoms
()
*
sizeof
(
double4
),
&
posq
[
0
],
numParticles
*
sizeof
(
double4
));
}
else
if
(
cu
.
getUseMixedPrecision
())
{
vector
<
float4
>
posqf
(
cu
.
getPaddedNumAtoms
());
cu
.
getPosq
().
download
(
posqf
);
vector
<
double4
>
posq
(
cu
.
getPaddedNumAtoms
());
for
(
int
i
=
0
;
i
<
numParticles
;
i
++
)
posq
[
i
]
=
make_double4
(
pos
[
i
][
0
],
pos
[
i
][
1
],
pos
[
i
][
2
],
posqf
[
i
].
w
);
result
=
cuMemcpyHtoD
(
positions
->
getDevicePointer
()
+
copy
*
cu
.
getPaddedNumAtoms
()
*
sizeof
(
double4
),
&
posq
[
0
],
numParticles
*
sizeof
(
double4
));
}
else
{
vector
<
float4
>
posq
(
cu
.
getPaddedNumAtoms
());
cu
.
getPosq
().
download
(
posq
);
for
(
int
i
=
0
;
i
<
numParticles
;
i
++
)
posq
[
i
]
=
make_float4
((
float
)
pos
[
i
][
0
],
(
float
)
pos
[
i
][
1
],
(
float
)
pos
[
i
][
2
],
posq
[
i
].
w
);
result
=
cuMemcpyHtoD
(
positions
->
getDevicePointer
()
+
copy
*
cu
.
getPaddedNumAtoms
()
*
sizeof
(
float4
),
&
posq
[
0
],
numParticles
*
sizeof
(
float4
));
}
if
(
result
!=
CUDA_SUCCESS
)
{
std
::
stringstream
str
;
str
<<
"Error uploading array "
<<
positions
->
getName
()
<<
": "
<<
CudaContext
::
getErrorString
(
result
)
<<
" ("
<<
result
<<
")"
;
...
...
@@ -207,11 +244,21 @@ void CudaIntegrateRPMDStepKernel::setVelocities(int copy, const vector<Vec3>& ve
throw
OpenMMException
(
"RPMDIntegrator: Cannot set velocities before the integrator is added to a Context"
);
if
(
vel
.
size
()
!=
numParticles
)
throw
OpenMMException
(
"RPMDIntegrator: wrong number of values passed to setVelocities()"
);
vector
<
float4
>
velm
(
cu
.
getPaddedNumAtoms
());
cu
.
getVelm
().
download
(
velm
);
for
(
int
i
=
0
;
i
<
numParticles
;
i
++
)
velm
[
i
]
=
make_float4
((
float
)
vel
[
i
][
0
],
(
float
)
vel
[
i
][
1
],
(
float
)
vel
[
i
][
2
],
velm
[
i
].
w
);
CUresult
result
=
cuMemcpyHtoD
(
velocities
->
getDevicePointer
()
+
copy
*
cu
.
getPaddedNumAtoms
()
*
sizeof
(
float4
),
&
velm
[
0
],
numParticles
*
sizeof
(
float4
));
CUresult
result
;
if
(
cu
.
getUseDoublePrecision
()
||
cu
.
getUseMixedPrecision
())
{
vector
<
double4
>
velm
(
cu
.
getPaddedNumAtoms
());
cu
.
getVelm
().
download
(
velm
);
for
(
int
i
=
0
;
i
<
numParticles
;
i
++
)
velm
[
i
]
=
make_double4
(
vel
[
i
][
0
],
vel
[
i
][
1
],
vel
[
i
][
2
],
velm
[
i
].
w
);
result
=
cuMemcpyHtoD
(
velocities
->
getDevicePointer
()
+
copy
*
cu
.
getPaddedNumAtoms
()
*
sizeof
(
double4
),
&
velm
[
0
],
numParticles
*
sizeof
(
double4
));
}
else
{
vector
<
float4
>
velm
(
cu
.
getPaddedNumAtoms
());
cu
.
getVelm
().
download
(
velm
);
for
(
int
i
=
0
;
i
<
numParticles
;
i
++
)
velm
[
i
]
=
make_float4
((
float
)
vel
[
i
][
0
],
(
float
)
vel
[
i
][
1
],
(
float
)
vel
[
i
][
2
],
velm
[
i
].
w
);
result
=
cuMemcpyHtoD
(
velocities
->
getDevicePointer
()
+
copy
*
cu
.
getPaddedNumAtoms
()
*
sizeof
(
float4
),
&
velm
[
0
],
numParticles
*
sizeof
(
float4
));
}
if
(
result
!=
CUDA_SUCCESS
)
{
std
::
stringstream
str
;
str
<<
"Error uploading array "
<<
velocities
->
getName
()
<<
": "
<<
CudaContext
::
getErrorString
(
result
)
<<
" ("
<<
result
<<
")"
;
...
...
@@ -221,9 +268,9 @@ void CudaIntegrateRPMDStepKernel::setVelocities(int copy, const vector<Vec3>& ve
void
CudaIntegrateRPMDStepKernel
::
copyToContext
(
int
copy
,
ContextImpl
&
context
)
{
void
*
copyPositionsArgs
[]
=
{
&
positions
->
getDevicePointer
(),
&
cu
.
getPosq
().
getDevicePointer
(),
&
cu
.
getAtomIndexArray
().
getDevicePointer
(),
&
copy
};
cu
.
executeKernel
(
copyToContextKernel
,
copyPositionsArgs
,
cu
.
getNumAtoms
());
cu
.
executeKernel
(
copy
Positions
ToContextKernel
,
copyPositionsArgs
,
cu
.
getNumAtoms
());
void
*
copyVelocitiesArgs
[]
=
{
&
velocities
->
getDevicePointer
(),
&
cu
.
getVelm
().
getDevicePointer
(),
&
cu
.
getAtomIndexArray
().
getDevicePointer
(),
&
copy
};
cu
.
executeKernel
(
copyToContextKernel
,
copyVelocitiesArgs
,
cu
.
getNumAtoms
());
cu
.
executeKernel
(
copy
Velocities
ToContextKernel
,
copyVelocitiesArgs
,
cu
.
getNumAtoms
());
}
string
CudaIntegrateRPMDStepKernel
::
createFFT
(
int
size
,
const
string
&
variable
,
bool
forward
)
{
...
...
@@ -237,10 +284,10 @@ string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable,
string
multImag
=
(
forward
?
"multiplyComplexImagPart"
:
"multiplyComplexImagPartConj"
);
source
<<
"{
\n
"
;
source
<<
"
float
3* real0 = "
<<
variable
<<
"real;
\n
"
;
source
<<
"
float
3* imag0 = "
<<
variable
<<
"imag;
\n
"
;
source
<<
"
float
3* real1 = &temp[blockStart];
\n
"
;
source
<<
"
float
3* imag1 = &temp[blockStart+blockDim.x];
\n
"
;
source
<<
"
mixed
3* real0 = "
<<
variable
<<
"real;
\n
"
;
source
<<
"
mixed
3* imag0 = "
<<
variable
<<
"imag;
\n
"
;
source
<<
"
mixed
3* real1 = &temp[blockStart];
\n
"
;
source
<<
"
mixed
3* imag1 = &temp[blockStart+blockDim.x];
\n
"
;
// Factor size, generating an appropriate block of code for each factor.
...
...
@@ -254,39 +301,39 @@ string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable,
source
<<
"if (indexInBlock < "
<<
(
L
*
m
)
<<
") {
\n
"
;
source
<<
"int i = indexInBlock;
\n
"
;
source
<<
"int j = i/"
<<
m
<<
";
\n
"
;
source
<<
"
float
3 c0r = real"
<<
input
<<
"[i];
\n
"
;
source
<<
"
float
3 c0i = imag"
<<
input
<<
"[i];
\n
"
;
source
<<
"
float
3 c1r = real"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"
float
3 c1i = imag"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"
float
3 c2r = real"
<<
input
<<
"[i+"
<<
(
2
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
float
3 c2i = imag"
<<
input
<<
"[i+"
<<
(
2
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
float
3 c3r = real"
<<
input
<<
"[i+"
<<
(
3
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
float
3 c3i = imag"
<<
input
<<
"[i+"
<<
(
3
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
float
3 c4r = real"
<<
input
<<
"[i+"
<<
(
4
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
float
3 c4i = imag"
<<
input
<<
"[i+"
<<
(
4
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
float
3 d0r = c1r+c4r;
\n
"
;
source
<<
"
float
3 d0i = c1i+c4i;
\n
"
;
source
<<
"
float
3 d1r = c2r+c3r;
\n
"
;
source
<<
"
float
3 d1i = c2i+c3i;
\n
"
;
source
<<
"
float
3 d2r = "
<<
cu
.
doubleToString
(
sin
(
0.4
*
M_PI
))
<<
"*(c1r-c4r);
\n
"
;
source
<<
"
float
3 d2i = "
<<
cu
.
doubleToString
(
sin
(
0.4
*
M_PI
))
<<
"*(c1i-c4i);
\n
"
;
source
<<
"
float
3 d3r = "
<<
cu
.
doubleToString
(
sin
(
0.4
*
M_PI
))
<<
"*(c2r-c3r);
\n
"
;
source
<<
"
float
3 d3i = "
<<
cu
.
doubleToString
(
sin
(
0.4
*
M_PI
))
<<
"*(c2i-c3i);
\n
"
;
source
<<
"
float
3 d4r = d0r+d1r;
\n
"
;
source
<<
"
float
3 d4i = d0i+d1i;
\n
"
;
source
<<
"
float
3 d5r = "
<<
cu
.
doubleToString
(
0.25
*
sqrt
(
5.0
))
<<
"*(d0r-d1r);
\n
"
;
source
<<
"
float
3 d5i = "
<<
cu
.
doubleToString
(
0.25
*
sqrt
(
5.0
))
<<
"*(d0i-d1i);
\n
"
;
source
<<
"
float
3 d6r = c0r-0.25f*d4r;
\n
"
;
source
<<
"
float
3 d6i = c0i-0.25f*d4i;
\n
"
;
source
<<
"
float
3 d7r = d6r+d5r;
\n
"
;
source
<<
"
float
3 d7i = d6i+d5i;
\n
"
;
source
<<
"
float
3 d8r = d6r-d5r;
\n
"
;
source
<<
"
float
3 d8i = d6i-d5i;
\n
"
;
source
<<
"
mixed
3 c0r = real"
<<
input
<<
"[i];
\n
"
;
source
<<
"
mixed
3 c0i = imag"
<<
input
<<
"[i];
\n
"
;
source
<<
"
mixed
3 c1r = real"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"
mixed
3 c1i = imag"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"
mixed
3 c2r = real"
<<
input
<<
"[i+"
<<
(
2
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
mixed
3 c2i = imag"
<<
input
<<
"[i+"
<<
(
2
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
mixed
3 c3r = real"
<<
input
<<
"[i+"
<<
(
3
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
mixed
3 c3i = imag"
<<
input
<<
"[i+"
<<
(
3
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
mixed
3 c4r = real"
<<
input
<<
"[i+"
<<
(
4
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
mixed
3 c4i = imag"
<<
input
<<
"[i+"
<<
(
4
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
mixed
3 d0r = c1r+c4r;
\n
"
;
source
<<
"
mixed
3 d0i = c1i+c4i;
\n
"
;
source
<<
"
mixed
3 d1r = c2r+c3r;
\n
"
;
source
<<
"
mixed
3 d1i = c2i+c3i;
\n
"
;
source
<<
"
mixed
3 d2r = "
<<
cu
.
doubleToString
(
sin
(
0.4
*
M_PI
))
<<
"*(c1r-c4r);
\n
"
;
source
<<
"
mixed
3 d2i = "
<<
cu
.
doubleToString
(
sin
(
0.4
*
M_PI
))
<<
"*(c1i-c4i);
\n
"
;
source
<<
"
mixed
3 d3r = "
<<
cu
.
doubleToString
(
sin
(
0.4
*
M_PI
))
<<
"*(c2r-c3r);
\n
"
;
source
<<
"
mixed
3 d3i = "
<<
cu
.
doubleToString
(
sin
(
0.4
*
M_PI
))
<<
"*(c2i-c3i);
\n
"
;
source
<<
"
mixed
3 d4r = d0r+d1r;
\n
"
;
source
<<
"
mixed
3 d4i = d0i+d1i;
\n
"
;
source
<<
"
mixed
3 d5r = "
<<
cu
.
doubleToString
(
0.25
*
sqrt
(
5.0
))
<<
"*(d0r-d1r);
\n
"
;
source
<<
"
mixed
3 d5i = "
<<
cu
.
doubleToString
(
0.25
*
sqrt
(
5.0
))
<<
"*(d0i-d1i);
\n
"
;
source
<<
"
mixed
3 d6r = c0r-0.25f*d4r;
\n
"
;
source
<<
"
mixed
3 d6i = c0i-0.25f*d4i;
\n
"
;
source
<<
"
mixed
3 d7r = d6r+d5r;
\n
"
;
source
<<
"
mixed
3 d7i = d6i+d5i;
\n
"
;
source
<<
"
mixed
3 d8r = d6r-d5r;
\n
"
;
source
<<
"
mixed
3 d8i = d6i-d5i;
\n
"
;
string
coeff
=
cu
.
doubleToString
(
sin
(
0.2
*
M_PI
)
/
sin
(
0.4
*
M_PI
));
source
<<
"
float
3 d9r = "
<<
sign
<<
"*(d2i+"
<<
coeff
<<
"*d3i);
\n
"
;
source
<<
"
float
3 d9i = "
<<
sign
<<
"*(-d2r-"
<<
coeff
<<
"*d3r);
\n
"
;
source
<<
"
float
3 d10r = "
<<
sign
<<
"*("
<<
coeff
<<
"*d2i-d3i);
\n
"
;
source
<<
"
float
3 d10i = "
<<
sign
<<
"*(d3r-"
<<
coeff
<<
"*d2r);
\n
"
;
source
<<
"
mixed
3 d9r = "
<<
sign
<<
"*(d2i+"
<<
coeff
<<
"*d3i);
\n
"
;
source
<<
"
mixed
3 d9i = "
<<
sign
<<
"*(-d2r-"
<<
coeff
<<
"*d3r);
\n
"
;
source
<<
"
mixed
3 d10r = "
<<
sign
<<
"*("
<<
coeff
<<
"*d2i-d3i);
\n
"
;
source
<<
"
mixed
3 d10i = "
<<
sign
<<
"*(d3r-"
<<
coeff
<<
"*d2r);
\n
"
;
source
<<
"real"
<<
output
<<
"[i+4*j*"
<<
m
<<
"] = c0r+d4r;
\n
"
;
source
<<
"imag"
<<
output
<<
"[i+4*j*"
<<
m
<<
"] = c0i+d4i;
\n
"
;
source
<<
"real"
<<
output
<<
"[i+(4*j+1)*"
<<
m
<<
"] = "
<<
multReal
<<
"(w[j*"
<<
size
<<
"/"
<<
(
5
*
L
)
<<
"], d7r+d9r, d7i+d9i);
\n
"
;
...
...
@@ -307,22 +354,22 @@ string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable,
source
<<
"if (indexInBlock < "
<<
(
L
*
m
)
<<
") {
\n
"
;
source
<<
"int i = indexInBlock;
\n
"
;
source
<<
"int j = i/"
<<
m
<<
";
\n
"
;
source
<<
"
float
3 c0r = real"
<<
input
<<
"[i];
\n
"
;
source
<<
"
float
3 c0i = imag"
<<
input
<<
"[i];
\n
"
;
source
<<
"
float
3 c1r = real"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"
float
3 c1i = imag"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"
float
3 c2r = real"
<<
input
<<
"[i+"
<<
(
2
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
float
3 c2i = imag"
<<
input
<<
"[i+"
<<
(
2
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
float
3 c3r = real"
<<
input
<<
"[i+"
<<
(
3
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
float
3 c3i = imag"
<<
input
<<
"[i+"
<<
(
3
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
float
3 d0r = c0r+c2r;
\n
"
;
source
<<
"
float
3 d0i = c0i+c2i;
\n
"
;
source
<<
"
float
3 d1r = c0r-c2r;
\n
"
;
source
<<
"
float
3 d1i = c0i-c2i;
\n
"
;
source
<<
"
float
3 d2r = c1r+c3r;
\n
"
;
source
<<
"
float
3 d2i = c1i+c3i;
\n
"
;
source
<<
"
float
3 d3r = "
<<
sign
<<
"*(c1i-c3i);
\n
"
;
source
<<
"
float
3 d3i = "
<<
sign
<<
"*(c3r-c1r);
\n
"
;
source
<<
"
mixed
3 c0r = real"
<<
input
<<
"[i];
\n
"
;
source
<<
"
mixed
3 c0i = imag"
<<
input
<<
"[i];
\n
"
;
source
<<
"
mixed
3 c1r = real"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"
mixed
3 c1i = imag"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"
mixed
3 c2r = real"
<<
input
<<
"[i+"
<<
(
2
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
mixed
3 c2i = imag"
<<
input
<<
"[i+"
<<
(
2
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
mixed
3 c3r = real"
<<
input
<<
"[i+"
<<
(
3
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
mixed
3 c3i = imag"
<<
input
<<
"[i+"
<<
(
3
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
mixed
3 d0r = c0r+c2r;
\n
"
;
source
<<
"
mixed
3 d0i = c0i+c2i;
\n
"
;
source
<<
"
mixed
3 d1r = c0r-c2r;
\n
"
;
source
<<
"
mixed
3 d1i = c0i-c2i;
\n
"
;
source
<<
"
mixed
3 d2r = c1r+c3r;
\n
"
;
source
<<
"
mixed
3 d2i = c1i+c3i;
\n
"
;
source
<<
"
mixed
3 d3r = "
<<
sign
<<
"*(c1i-c3i);
\n
"
;
source
<<
"
mixed
3 d3i = "
<<
sign
<<
"*(c3r-c1r);
\n
"
;
source
<<
"real"
<<
output
<<
"[i+3*j*"
<<
m
<<
"] = d0r+d2r;
\n
"
;
source
<<
"imag"
<<
output
<<
"[i+3*j*"
<<
m
<<
"] = d0i+d2i;
\n
"
;
source
<<
"real"
<<
output
<<
"[i+(3*j+1)*"
<<
m
<<
"] = "
<<
multReal
<<
"(w[j*"
<<
size
<<
"/"
<<
(
4
*
L
)
<<
"], d1r+d3r, d1i+d3i);
\n
"
;
...
...
@@ -341,18 +388,18 @@ string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable,
source
<<
"if (indexInBlock < "
<<
(
L
*
m
)
<<
") {
\n
"
;
source
<<
"int i = indexInBlock;
\n
"
;
source
<<
"int j = i/"
<<
m
<<
";
\n
"
;
source
<<
"
float
3 c0r = real"
<<
input
<<
"[i];
\n
"
;
source
<<
"
float
3 c0i = imag"
<<
input
<<
"[i];
\n
"
;
source
<<
"
float
3 c1r = real"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"
float
3 c1i = imag"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"
float
3 c2r = real"
<<
input
<<
"[i+"
<<
(
2
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
float
3 c2i = imag"
<<
input
<<
"[i+"
<<
(
2
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
float
3 d0r = c1r+c2r;
\n
"
;
source
<<
"
float
3 d0i = c1i+c2i;
\n
"
;
source
<<
"
float
3 d1r = c0r-0.5f*d0r;
\n
"
;
source
<<
"
float
3 d1i = c0i-0.5f*d0i;
\n
"
;
source
<<
"
float
3 d2r = "
<<
sign
<<
"*"
<<
cu
.
doubleToString
(
sin
(
M_PI
/
3.0
))
<<
"*(c1i-c2i);
\n
"
;
source
<<
"
float
3 d2i = "
<<
sign
<<
"*"
<<
cu
.
doubleToString
(
sin
(
M_PI
/
3.0
))
<<
"*(c2r-c1r);
\n
"
;
source
<<
"
mixed
3 c0r = real"
<<
input
<<
"[i];
\n
"
;
source
<<
"
mixed
3 c0i = imag"
<<
input
<<
"[i];
\n
"
;
source
<<
"
mixed
3 c1r = real"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"
mixed
3 c1i = imag"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"
mixed
3 c2r = real"
<<
input
<<
"[i+"
<<
(
2
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
mixed
3 c2i = imag"
<<
input
<<
"[i+"
<<
(
2
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
mixed
3 d0r = c1r+c2r;
\n
"
;
source
<<
"
mixed
3 d0i = c1i+c2i;
\n
"
;
source
<<
"
mixed
3 d1r = c0r-0.5f*d0r;
\n
"
;
source
<<
"
mixed
3 d1i = c0i-0.5f*d0i;
\n
"
;
source
<<
"
mixed
3 d2r = "
<<
sign
<<
"*"
<<
cu
.
doubleToString
(
sin
(
M_PI
/
3.0
))
<<
"*(c1i-c2i);
\n
"
;
source
<<
"
mixed
3 d2i = "
<<
sign
<<
"*"
<<
cu
.
doubleToString
(
sin
(
M_PI
/
3.0
))
<<
"*(c2r-c1r);
\n
"
;
source
<<
"real"
<<
output
<<
"[i+2*j*"
<<
m
<<
"] = c0r+d0r;
\n
"
;
source
<<
"imag"
<<
output
<<
"[i+2*j*"
<<
m
<<
"] = c0i+d0i;
\n
"
;
source
<<
"real"
<<
output
<<
"[i+(2*j+1)*"
<<
m
<<
"] = "
<<
multReal
<<
"(w[j*"
<<
size
<<
"/"
<<
(
3
*
L
)
<<
"], d1r+d2r, d1i+d2i);
\n
"
;
...
...
@@ -369,10 +416,10 @@ string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable,
source
<<
"if (indexInBlock < "
<<
(
L
*
m
)
<<
") {
\n
"
;
source
<<
"int i = indexInBlock;
\n
"
;
source
<<
"int j = i/"
<<
m
<<
";
\n
"
;
source
<<
"
float
3 c0r = real"
<<
input
<<
"[i];
\n
"
;
source
<<
"
float
3 c0i = imag"
<<
input
<<
"[i];
\n
"
;
source
<<
"
float
3 c1r = real"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"
float
3 c1i = imag"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"
mixed
3 c0r = real"
<<
input
<<
"[i];
\n
"
;
source
<<
"
mixed
3 c0i = imag"
<<
input
<<
"[i];
\n
"
;
source
<<
"
mixed
3 c1r = real"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"
mixed
3 c1i = imag"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"real"
<<
output
<<
"[i+j*"
<<
m
<<
"] = c0r+c1r;
\n
"
;
source
<<
"imag"
<<
output
<<
"[i+j*"
<<
m
<<
"] = c0i+c1i;
\n
"
;
source
<<
"real"
<<
output
<<
"[i+(j+1)*"
<<
m
<<
"] = "
<<
multReal
<<
"(w[j*"
<<
size
<<
"/"
<<
(
2
*
L
)
<<
"], c0r-c1r, c0i-c1i);
\n
"
;
...
...
plugins/rpmd/platforms/cuda/src/CudaRpmdKernels.h
View file @
cd6af26e
...
...
@@ -91,7 +91,7 @@ private:
CudaArray
*
forces
;
CudaArray
*
positions
;
CudaArray
*
velocities
;
CUfunction
pileKernel
,
stepKernel
,
velocitiesKernel
,
copyToContextKernel
,
copyFromContextKernel
,
translateKernel
;
CUfunction
pileKernel
,
stepKernel
,
velocitiesKernel
,
copy
Positions
ToContextKernel
,
copy
VelocitiesToContextKernel
,
copyForces
FromContextKernel
,
translateKernel
;
};
}
// namespace OpenMM
...
...
plugins/rpmd/platforms/cuda/src/kernels/rpmd.cu
View file @
cd6af26e
__device__
float
3
multiplyComplexRealPart
(
float
2
c1
,
float
3
c2r
,
float
3
c2i
)
{
__device__
mixed
3
multiplyComplexRealPart
(
mixed
2
c1
,
mixed
3
c2r
,
mixed
3
c2i
)
{
return
c1
.
x
*
c2r
-
c1
.
y
*
c2i
;
}
__device__
float
3
multiplyComplexImagPart
(
float
2
c1
,
float
3
c2r
,
float
3
c2i
)
{
__device__
mixed
3
multiplyComplexImagPart
(
mixed
2
c1
,
mixed
3
c2r
,
mixed
3
c2i
)
{
return
c1
.
x
*
c2i
+
c1
.
y
*
c2r
;
}
__device__
float
3
multiplyComplexRealPartConj
(
float
2
c1
,
float
3
c2r
,
float
3
c2i
)
{
__device__
mixed
3
multiplyComplexRealPartConj
(
mixed
2
c1
,
mixed
3
c2r
,
mixed
3
c2i
)
{
return
c1
.
x
*
c2r
+
c1
.
y
*
c2i
;
}
__device__
float
3
multiplyComplexImagPartConj
(
float
2
c1
,
float
3
c2r
,
float
3
c2i
)
{
__device__
mixed
3
multiplyComplexImagPartConj
(
mixed
2
c1
,
mixed
3
c2r
,
mixed
3
c2i
)
{
return
c1
.
x
*
c2i
-
c1
.
y
*
c2r
;
}
/**
* Apply the PILE-L thermostat.
*/
extern
"C"
__global__
void
applyPileThermostat
(
float
4
*
velm
,
float4
*
random
,
unsigned
int
randomIndex
,
float
dt
,
float
kT
,
float
friction
)
{
extern
"C"
__global__
void
applyPileThermostat
(
mixed
4
*
velm
,
float4
*
random
,
unsigned
int
randomIndex
,
mixed
dt
,
mixed
kT
,
mixed
friction
)
{
const
int
numBlocks
=
blockDim
.
x
*
gridDim
.
x
/
NUM_COPIES
;
const
int
blockStart
=
NUM_COPIES
*
(
threadIdx
.
x
/
NUM_COPIES
);
const
int
indexInBlock
=
threadIdx
.
x
-
blockStart
;
const
float
nkT
=
NUM_COPIES
*
kT
;
const
float
twown
=
2.0
f
*
nkT
/
HBAR
;
const
float
c1_0
=
EXP
(
-
0.5
f
*
dt
*
friction
);
const
float
c2_0
=
SQRT
(
1.0
f
-
c1_0
*
c1_0
);
__shared__
float
3
v
[
2
*
THREAD_BLOCK_SIZE
];
__shared__
float
3
temp
[
2
*
THREAD_BLOCK_SIZE
];
__shared__
float
2
w
[
NUM_COPIES
];
float
3
*
vreal
=
&
v
[
blockStart
];
float
3
*
vimag
=
&
v
[
blockStart
+
blockDim
.
x
];
const
mixed
nkT
=
NUM_COPIES
*
kT
;
const
mixed
twown
=
2.0
f
*
nkT
/
HBAR
;
const
mixed
c1_0
=
EXP
(
-
0.5
f
*
dt
*
friction
);
const
mixed
c2_0
=
SQRT
(
1.0
f
-
c1_0
*
c1_0
);
__shared__
mixed
3
v
[
2
*
THREAD_BLOCK_SIZE
];
__shared__
mixed
3
temp
[
2
*
THREAD_BLOCK_SIZE
];
__shared__
mixed
2
w
[
NUM_COPIES
];
mixed
3
*
vreal
=
&
v
[
blockStart
];
mixed
3
*
vimag
=
&
v
[
blockStart
+
blockDim
.
x
];
if
(
threadIdx
.
x
<
NUM_COPIES
)
w
[
indexInBlock
]
=
make_
float
2
(
cos
(
-
indexInBlock
*
2
*
M_PI
/
NUM_COPIES
),
sin
(
-
indexInBlock
*
2
*
M_PI
/
NUM_COPIES
));
w
[
indexInBlock
]
=
make_
mixed
2
(
cos
(
-
indexInBlock
*
2
*
M_PI
/
NUM_COPIES
),
sin
(
-
indexInBlock
*
2
*
M_PI
/
NUM_COPIES
));
__syncthreads
();
randomIndex
+=
NUM_COPIES
*
((
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
NUM_COPIES
);
for
(
int
particle
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
NUM_COPIES
;
particle
<
NUM_ATOMS
;
particle
+=
numBlocks
)
{
float
4
particleVelm
=
velm
[
particle
+
indexInBlock
*
PADDED_NUM_ATOMS
];
float
invMass
=
particleVelm
.
w
;
float
c3_0
=
c2_0
*
SQRT
(
nkT
*
invMass
);
mixed
4
particleVelm
=
velm
[
particle
+
indexInBlock
*
PADDED_NUM_ATOMS
];
mixed
invMass
=
particleVelm
.
w
;
mixed
c3_0
=
c2_0
*
SQRT
(
nkT
*
invMass
);
// Forward FFT.
vreal
[
indexInBlock
]
=
SCALE
*
make_
float
3
(
particleVelm
.
x
,
particleVelm
.
y
,
particleVelm
.
z
);
vimag
[
indexInBlock
]
=
make_
float
3
(
0
);
vreal
[
indexInBlock
]
=
SCALE
*
make_
mixed
3
(
particleVelm
.
x
,
particleVelm
.
y
,
particleVelm
.
z
);
vimag
[
indexInBlock
]
=
make_
mixed
3
(
0
);
__syncthreads
();
FFT_V_FORWARD
...
...
@@ -53,28 +53,28 @@ extern "C" __global__ void applyPileThermostat(float4* velm, float4* random, uns
// Apply a local Langevin thermostat to the centroid mode.
float4
rand
=
random
[
randomIndex
];
vreal
[
0
]
=
vreal
[
0
]
*
c1_0
+
c3_0
*
make_
float
3
(
rand
.
x
,
rand
.
y
,
rand
.
z
);
vreal
[
0
]
=
vreal
[
0
]
*
c1_0
+
c3_0
*
make_
mixed
3
(
rand
.
x
,
rand
.
y
,
rand
.
z
);
}
else
{
// Use critical damping white noise for the remaining modes.
int
k
=
(
indexInBlock
<=
NUM_COPIES
/
2
?
indexInBlock
:
NUM_COPIES
-
indexInBlock
);
const
bool
isCenter
=
(
NUM_COPIES
%
2
==
0
&&
k
==
NUM_COPIES
/
2
);
const
float
wk
=
twown
*
sin
(
k
*
M_PI
/
NUM_COPIES
);
const
float
c1
=
EXP
(
-
wk
*
dt
);
const
float
c2
=
SQRT
((
1.0
f
-
c1
*
c1
)
/
2.0
f
)
*
(
isCenter
?
sqrt
(
2.0
f
)
:
1.0
f
);
const
float
c3
=
c2
*
SQRT
(
nkT
*
invMass
);
float4
rand1
=
c3
*
random
[
randomIndex
+
k
];
float4
rand2
=
(
isCenter
?
make_float4
(
0
)
:
c3
*
random
[
randomIndex
+
NUM_COPIES
-
k
]);
vreal
[
indexInBlock
]
=
c1
*
vreal
[
indexInBlock
]
+
make_
float
3
(
rand1
.
x
,
rand1
.
y
,
rand1
.
z
);
vimag
[
indexInBlock
]
=
c1
*
vimag
[
indexInBlock
]
+
(
indexInBlock
<
NUM_COPIES
/
2
?
make_
float
3
(
rand2
.
x
,
rand2
.
y
,
rand2
.
z
)
:
make_
float
3
(
-
rand2
.
x
,
-
rand2
.
y
,
-
rand2
.
z
));
const
mixed
wk
=
twown
*
sin
(
k
*
M_PI
/
NUM_COPIES
);
const
mixed
c1
=
EXP
(
-
wk
*
dt
);
const
mixed
c2
=
SQRT
((
1.0
f
-
c1
*
c1
)
/
2.0
f
)
*
(
isCenter
?
sqrt
(
2.0
f
)
:
1.0
f
);
const
mixed
c3
=
c2
*
SQRT
(
nkT
*
invMass
);
float4
rand1
=
random
[
randomIndex
+
k
];
float4
rand2
=
(
isCenter
?
make_float4
(
0
)
:
random
[
randomIndex
+
NUM_COPIES
-
k
]);
vreal
[
indexInBlock
]
=
c1
*
vreal
[
indexInBlock
]
+
c3
*
make_
mixed
3
(
rand1
.
x
,
rand1
.
y
,
rand1
.
z
);
vimag
[
indexInBlock
]
=
c1
*
vimag
[
indexInBlock
]
+
c3
*
(
indexInBlock
<
NUM_COPIES
/
2
?
make_
mixed
3
(
rand2
.
x
,
rand2
.
y
,
rand2
.
z
)
:
make_
mixed
3
(
-
rand2
.
x
,
-
rand2
.
y
,
-
rand2
.
z
));
}
__syncthreads
();
// Inverse FFT.
FFT_V_BACKWARD
velm
[
particle
+
indexInBlock
*
PADDED_NUM_ATOMS
]
=
make_
float
4
(
SCALE
*
vreal
[
indexInBlock
].
x
,
SCALE
*
vreal
[
indexInBlock
].
y
,
SCALE
*
vreal
[
indexInBlock
].
z
,
particleVelm
.
w
);
velm
[
particle
+
indexInBlock
*
PADDED_NUM_ATOMS
]
=
make_
mixed
4
(
SCALE
*
vreal
[
indexInBlock
].
x
,
SCALE
*
vreal
[
indexInBlock
].
y
,
SCALE
*
vreal
[
indexInBlock
].
z
,
particleVelm
.
w
);
randomIndex
+=
blockDim
.
x
*
gridDim
.
x
;
}
}
...
...
@@ -82,24 +82,24 @@ extern "C" __global__ void applyPileThermostat(float4* velm, float4* random, uns
/**
* Advance the positions and velocities.
*/
extern
"C"
__global__
void
integrateStep
(
float
4
*
posq
,
float
4
*
velm
,
long
long
*
force
,
float
dt
,
float
kT
)
{
extern
"C"
__global__
void
integrateStep
(
mixed
4
*
posq
,
mixed
4
*
velm
,
long
long
*
force
,
mixed
dt
,
mixed
kT
)
{
const
int
numBlocks
=
(
blockDim
.
x
*
gridDim
.
x
)
/
NUM_COPIES
;
const
int
blockStart
=
NUM_COPIES
*
(
threadIdx
.
x
/
NUM_COPIES
);
const
int
indexInBlock
=
threadIdx
.
x
-
blockStart
;
const
float
nkT
=
NUM_COPIES
*
kT
;
const
float
twown
=
2.0
f
*
nkT
/
HBAR
;
const
float
forceScale
=
1
/
(
float
)
0xFFFFFFFF
;
__shared__
float
3
q
[
2
*
THREAD_BLOCK_SIZE
];
__shared__
float
3
v
[
2
*
THREAD_BLOCK_SIZE
];
__shared__
float
3
temp
[
2
*
THREAD_BLOCK_SIZE
];
__shared__
float
2
w
[
NUM_COPIES
];
const
mixed
nkT
=
NUM_COPIES
*
kT
;
const
mixed
twown
=
2.0
f
*
nkT
/
HBAR
;
const
mixed
forceScale
=
1
/
(
mixed
)
0xFFFFFFFF
;
__shared__
mixed
3
q
[
2
*
THREAD_BLOCK_SIZE
];
__shared__
mixed
3
v
[
2
*
THREAD_BLOCK_SIZE
];
__shared__
mixed
3
temp
[
2
*
THREAD_BLOCK_SIZE
];
__shared__
mixed
2
w
[
NUM_COPIES
];
// Update velocities.
for
(
int
particle
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
NUM_COPIES
;
particle
<
NUM_ATOMS
;
particle
+=
numBlocks
)
{
int
index
=
particle
+
indexInBlock
*
PADDED_NUM_ATOMS
;
int
forceIndex
=
particle
+
indexInBlock
*
PADDED_NUM_ATOMS
*
3
;
float
4
particleVelm
=
velm
[
index
];
mixed
4
particleVelm
=
velm
[
index
];
particleVelm
.
x
+=
forceScale
*
force
[
forceIndex
]
*
(
0.5
f
*
dt
*
particleVelm
.
w
);
particleVelm
.
y
+=
forceScale
*
force
[
forceIndex
+
PADDED_NUM_ATOMS
]
*
(
0.5
f
*
dt
*
particleVelm
.
w
);
particleVelm
.
z
+=
forceScale
*
force
[
forceIndex
+
PADDED_NUM_ATOMS
*
2
]
*
(
0.5
f
*
dt
*
particleVelm
.
w
);
...
...
@@ -108,23 +108,23 @@ extern "C" __global__ void integrateStep(float4* posq, float4* velm, long long*
// Evolve the free ring polymer by transforming to the frequency domain.
float
3
*
qreal
=
&
q
[
blockStart
];
float
3
*
qimag
=
&
q
[
blockStart
+
blockDim
.
x
];
float
3
*
vreal
=
&
v
[
blockStart
];
float
3
*
vimag
=
&
v
[
blockStart
+
blockDim
.
x
];
mixed
3
*
qreal
=
&
q
[
blockStart
];
mixed
3
*
qimag
=
&
q
[
blockStart
+
blockDim
.
x
];
mixed
3
*
vreal
=
&
v
[
blockStart
];
mixed
3
*
vimag
=
&
v
[
blockStart
+
blockDim
.
x
];
if
(
threadIdx
.
x
<
NUM_COPIES
)
w
[
indexInBlock
]
=
make_
float
2
(
cos
(
-
indexInBlock
*
2
*
M_PI
/
NUM_COPIES
),
sin
(
-
indexInBlock
*
2
*
M_PI
/
NUM_COPIES
));
w
[
indexInBlock
]
=
make_
mixed
2
(
cos
(
-
indexInBlock
*
2
*
M_PI
/
NUM_COPIES
),
sin
(
-
indexInBlock
*
2
*
M_PI
/
NUM_COPIES
));
__syncthreads
();
for
(
int
particle
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
NUM_COPIES
;
particle
<
NUM_ATOMS
;
particle
+=
numBlocks
)
{
float
4
particlePosq
=
posq
[
particle
+
indexInBlock
*
PADDED_NUM_ATOMS
];
float
4
particleVelm
=
velm
[
particle
+
indexInBlock
*
PADDED_NUM_ATOMS
];
mixed
4
particlePosq
=
posq
[
particle
+
indexInBlock
*
PADDED_NUM_ATOMS
];
mixed
4
particleVelm
=
velm
[
particle
+
indexInBlock
*
PADDED_NUM_ATOMS
];
// Forward FFT.
qreal
[
indexInBlock
]
=
SCALE
*
make_
float
3
(
particlePosq
.
x
,
particlePosq
.
y
,
particlePosq
.
z
);
qimag
[
indexInBlock
]
=
make_
float
3
(
0
);
vreal
[
indexInBlock
]
=
SCALE
*
make_
float
3
(
particleVelm
.
x
,
particleVelm
.
y
,
particleVelm
.
z
);
vimag
[
indexInBlock
]
=
make_
float
3
(
0
);
qreal
[
indexInBlock
]
=
SCALE
*
make_
mixed
3
(
particlePosq
.
x
,
particlePosq
.
y
,
particlePosq
.
z
);
qimag
[
indexInBlock
]
=
make_
mixed
3
(
0
);
vreal
[
indexInBlock
]
=
SCALE
*
make_
mixed
3
(
particleVelm
.
x
,
particleVelm
.
y
,
particleVelm
.
z
);
vimag
[
indexInBlock
]
=
make_
mixed
3
(
0
);
__syncthreads
();
FFT_Q_FORWARD
FFT_V_FORWARD
...
...
@@ -136,12 +136,12 @@ extern "C" __global__ void integrateStep(float4* posq, float4* velm, long long*
qimag
[
0
]
+=
vimag
[
0
]
*
dt
;
}
else
{
const
float
wk
=
twown
*
sin
(
indexInBlock
*
M_PI
/
NUM_COPIES
);
const
float
wt
=
wk
*
dt
;
const
float
coswt
=
cos
(
wt
);
const
float
sinwt
=
sin
(
wt
);
const
float
3
vprimereal
=
vreal
[
indexInBlock
]
*
coswt
-
qreal
[
indexInBlock
]
*
(
wk
*
sinwt
);
// Advance velocity from t to t+dt
const
float
3
vprimeimag
=
vimag
[
indexInBlock
]
*
coswt
-
qimag
[
indexInBlock
]
*
(
wk
*
sinwt
);
const
mixed
wk
=
twown
*
sin
(
indexInBlock
*
M_PI
/
NUM_COPIES
);
const
mixed
wt
=
wk
*
dt
;
const
mixed
coswt
=
cos
(
wt
);
const
mixed
sinwt
=
sin
(
wt
);
const
mixed
3
vprimereal
=
vreal
[
indexInBlock
]
*
coswt
-
qreal
[
indexInBlock
]
*
(
wk
*
sinwt
);
// Advance velocity from t to t+dt
const
mixed
3
vprimeimag
=
vimag
[
indexInBlock
]
*
coswt
-
qimag
[
indexInBlock
]
*
(
wk
*
sinwt
);
qreal
[
indexInBlock
]
=
vreal
[
indexInBlock
]
*
(
sinwt
/
wk
)
+
qreal
[
indexInBlock
]
*
coswt
;
// Advance position from t to t+dt
qimag
[
indexInBlock
]
=
vimag
[
indexInBlock
]
*
(
sinwt
/
wk
)
+
qimag
[
indexInBlock
]
*
coswt
;
vreal
[
indexInBlock
]
=
vprimereal
;
...
...
@@ -153,26 +153,26 @@ extern "C" __global__ void integrateStep(float4* posq, float4* velm, long long*
FFT_Q_BACKWARD
FFT_V_BACKWARD
posq
[
particle
+
indexInBlock
*
PADDED_NUM_ATOMS
]
=
make_
float
4
(
SCALE
*
qreal
[
indexInBlock
].
x
,
SCALE
*
qreal
[
indexInBlock
].
y
,
SCALE
*
qreal
[
indexInBlock
].
z
,
particlePosq
.
w
);
velm
[
particle
+
indexInBlock
*
PADDED_NUM_ATOMS
]
=
make_
float
4
(
SCALE
*
vreal
[
indexInBlock
].
x
,
SCALE
*
vreal
[
indexInBlock
].
y
,
SCALE
*
vreal
[
indexInBlock
].
z
,
particleVelm
.
w
);
posq
[
particle
+
indexInBlock
*
PADDED_NUM_ATOMS
]
=
make_
mixed
4
(
SCALE
*
qreal
[
indexInBlock
].
x
,
SCALE
*
qreal
[
indexInBlock
].
y
,
SCALE
*
qreal
[
indexInBlock
].
z
,
particlePosq
.
w
);
velm
[
particle
+
indexInBlock
*
PADDED_NUM_ATOMS
]
=
make_
mixed
4
(
SCALE
*
vreal
[
indexInBlock
].
x
,
SCALE
*
vreal
[
indexInBlock
].
y
,
SCALE
*
vreal
[
indexInBlock
].
z
,
particleVelm
.
w
);
}
}
/**
* Advance the velocities by a half step.
*/
extern
"C"
__global__
void
advanceVelocities
(
float
4
*
velm
,
long
long
*
force
,
float
dt
)
{
extern
"C"
__global__
void
advanceVelocities
(
mixed
4
*
velm
,
long
long
*
force
,
mixed
dt
)
{
const
int
numBlocks
=
(
blockDim
.
x
*
gridDim
.
x
)
/
NUM_COPIES
;
const
int
blockStart
=
NUM_COPIES
*
(
threadIdx
.
x
/
NUM_COPIES
);
const
int
indexInBlock
=
threadIdx
.
x
-
blockStart
;
const
float
forceScale
=
1
/
(
float
)
0xFFFFFFFF
;
const
mixed
forceScale
=
1
/
(
mixed
)
0xFFFFFFFF
;
// Update velocities.
for
(
int
particle
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
NUM_COPIES
;
particle
<
NUM_ATOMS
;
particle
+=
numBlocks
)
{
int
index
=
particle
+
indexInBlock
*
PADDED_NUM_ATOMS
;
int
forceIndex
=
particle
+
indexInBlock
*
PADDED_NUM_ATOMS
*
3
;
float
4
particleVelm
=
velm
[
index
];
mixed
4
particleVelm
=
velm
[
index
];
particleVelm
.
x
+=
forceScale
*
force
[
forceIndex
]
*
(
0.5
f
*
dt
*
particleVelm
.
w
);
particleVelm
.
y
+=
forceScale
*
force
[
forceIndex
+
PADDED_NUM_ATOMS
]
*
(
0.5
f
*
dt
*
particleVelm
.
w
);
particleVelm
.
z
+=
forceScale
*
force
[
forceIndex
+
PADDED_NUM_ATOMS
*
2
]
*
(
0.5
f
*
dt
*
particleVelm
.
w
);
...
...
@@ -181,9 +181,20 @@ extern "C" __global__ void advanceVelocities(float4* velm, long long* force, flo
}
/**
* Copy a set of p
er-atom value
s from the integrator's arrays to the context.
* Copy a set of p
osition
s from the integrator's arrays to the context.
*/
extern
"C"
__global__
void
copyToContext
(
float4
*
src
,
float4
*
dst
,
int
*
order
,
int
copy
)
{
extern
"C"
__global__
void
copyPositionsToContext
(
mixed4
*
src
,
real4
*
dst
,
int
*
order
,
int
copy
)
{
const
int
base
=
copy
*
PADDED_NUM_ATOMS
;
for
(
int
particle
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
particle
<
NUM_ATOMS
;
particle
+=
blockDim
.
x
*
gridDim
.
x
)
{
mixed4
posq
=
src
[
base
+
order
[
particle
]];
dst
[
particle
]
=
make_real4
(
posq
.
x
,
posq
.
y
,
posq
.
z
,
posq
.
w
);
}
}
/**
* Copy a set of velocities from the integrator's arrays to the context.
*/
extern
"C"
__global__
void
copyVelocitiesToContext
(
mixed4
*
src
,
mixed4
*
dst
,
int
*
order
,
int
copy
)
{
const
int
base
=
copy
*
PADDED_NUM_ATOMS
;
for
(
int
particle
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
particle
<
NUM_ATOMS
;
particle
+=
blockDim
.
x
*
gridDim
.
x
)
{
dst
[
particle
]
=
src
[
base
+
order
[
particle
]];
...
...
@@ -191,9 +202,9 @@ extern "C" __global__ void copyToContext(float4* src, float4* dst, int* order, i
}
/**
* Copy a set of
per-atom force valu
es from the context to the integrator's arrays.
* Copy a set of
forc
es from the context to the integrator's arrays.
*/
extern
"C"
__global__
void
copyFromContext
(
long
long
*
src
,
long
long
*
dst
,
int
*
order
,
int
copy
)
{
extern
"C"
__global__
void
copyF
orcesF
romContext
(
long
long
*
src
,
long
long
*
dst
,
int
*
order
,
int
copy
)
{
const
int
base
=
copy
*
PADDED_NUM_ATOMS
*
3
;
for
(
int
particle
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
particle
<
NUM_ATOMS
;
particle
+=
blockDim
.
x
*
gridDim
.
x
)
{
dst
[
base
+
order
[
particle
]]
=
src
[
particle
];
...
...
@@ -205,10 +216,11 @@ extern "C" __global__ void copyFromContext(long long* src, long long* dst, int*
/**
* Update atom positions so all copies are offset by the same number of periodic box widths.
*/
extern
"C"
__global__
void
applyCellTranslations
(
float
4
*
posq
,
float
4
*
movedPos
,
int
*
order
,
int
movedCopy
)
{
extern
"C"
__global__
void
applyCellTranslations
(
mixed
4
*
posq
,
real
4
*
movedPos
,
int
*
order
,
int
movedCopy
)
{
for
(
int
particle
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
particle
<
NUM_ATOMS
;
particle
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
index
=
order
[
particle
];
float4
delta
=
movedPos
[
particle
]
-
posq
[
movedCopy
*
PADDED_NUM_ATOMS
+
index
];
real4
p
=
movedPos
[
particle
];
mixed4
delta
=
make_mixed4
(
p
.
x
,
p
.
y
,
p
.
z
,
p
.
w
)
-
posq
[
movedCopy
*
PADDED_NUM_ATOMS
+
index
];
for
(
int
copy
=
0
;
copy
<
NUM_COPIES
;
copy
++
)
posq
[
copy
*
PADDED_NUM_ATOMS
+
index
]
+=
delta
;
}
...
...
plugins/rpmd/platforms/cuda/tests/CMakeLists.txt
View file @
cd6af26e
...
...
@@ -14,6 +14,10 @@ FOREACH(TEST_PROG ${TEST_PROGS})
# Link with shared library
ADD_EXECUTABLE
(
${
TEST_ROOT
}
${
TEST_PROG
}
)
TARGET_LINK_LIBRARIES
(
${
TEST_ROOT
}
${
SHARED_RPMD_TARGET
}
)
ADD_TEST
(
${
TEST_ROOT
}
${
EXECUTABLE_OUTPUT_PATH
}
/
${
TEST_ROOT
}
)
ADD_TEST
(
${
TEST_ROOT
}
Single
${
EXECUTABLE_OUTPUT_PATH
}
/
${
TEST_ROOT
}
single
)
IF
(
OPENMM_BUILD_CUDA_DOUBLE_PRECISION_TESTS
)
ADD_TEST
(
${
TEST_ROOT
}
Mixed
${
EXECUTABLE_OUTPUT_PATH
}
/
${
TEST_ROOT
}
mixed
)
ADD_TEST
(
${
TEST_ROOT
}
Double
${
EXECUTABLE_OUTPUT_PATH
}
/
${
TEST_ROOT
}
double
)
ENDIF
(
OPENMM_BUILD_CUDA_DOUBLE_PRECISION_TESTS
)
ENDFOREACH
(
TEST_PROG
${
TEST_PROGS
}
)
plugins/rpmd/platforms/cuda/tests/TestCudaRpmd.cpp
View file @
cd6af26e
...
...
@@ -165,7 +165,6 @@ void testParaHydrogen() {
vector
<
int
>
counts
(
numBins
,
0
);
const
double
invBoxSize
=
1.0
/
boxSize
;
double
meanKE
=
0.0
;
const
RealOpenMM
hbar
=
1.054571628e-34
*
AVOGADRO
/
(
1000
*
1e-12
);
for
(
int
step
=
0
;
step
<
numSteps
;
step
++
)
{
integ
.
step
(
20
);
vector
<
State
>
states
(
numCopies
);
...
...
@@ -221,9 +220,11 @@ void testParaHydrogen() {
ASSERT_USUALLY_EQUAL_TOL
(
60.0
,
1.5
*
temperature
+
meanKE
,
0.02
);
}
int
main
()
{
int
main
(
int
argc
,
char
*
argv
[]
)
{
try
{
Platform
::
loadPluginsFromDirectory
(
Platform
::
getDefaultPluginsDirectory
());
if
(
argc
>
1
)
Platform
::
getPlatformByName
(
"CUDA"
).
setPropertyDefaultValue
(
"CudaPrecision"
,
string
(
argv
[
1
]));
testFreeParticles
();
testParaHydrogen
();
}
...
...
plugins/rpmd/platforms/opencl/src/OpenCLRpmdKernels.cpp
View file @
cd6af26e
...
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011 Stanford University and the Authors.
*
* Portions copyright (c) 2011
-2012
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -49,6 +49,7 @@ OpenCLIntegrateRPMDStepKernel::~OpenCLIntegrateRPMDStepKernel() {
if
(
velocities
!=
NULL
)
delete
velocities
;
}
void
OpenCLIntegrateRPMDStepKernel
::
initialize
(
const
System
&
system
,
const
RPMDIntegrator
&
integrator
)
{
cl
.
getPlatformData
().
initializeContexts
(
system
);
numCopies
=
integrator
.
getNumCopies
();
...
...
@@ -59,20 +60,34 @@ void OpenCLIntegrateRPMDStepKernel::initialize(const System& system, const RPMDI
if
(
numCopies
!=
OpenCLFFT3D
::
findLegalDimension
(
numCopies
))
throw
OpenMMException
(
"RPMDIntegrator: the number of copies must be a multiple of powers of 2, 3, and 5."
);
int
paddedParticles
=
cl
.
getPaddedNumAtoms
();
forces
=
OpenCLArray
::
create
<
mm_float4
>
(
cl
,
numCopies
*
paddedParticles
,
"rpmdForces"
);
positions
=
OpenCLArray
::
create
<
mm_float4
>
(
cl
,
numCopies
*
paddedParticles
,
"rpmdPositions"
);
velocities
=
OpenCLArray
::
create
<
mm_float4
>
(
cl
,
numCopies
*
paddedParticles
,
"rpmdVelocities"
);
int
forceElementSize
=
(
cl
.
getUseDoublePrecision
()
?
sizeof
(
mm_double4
)
:
sizeof
(
mm_float4
));
forces
=
new
OpenCLArray
(
cl
,
numCopies
*
paddedParticles
,
forceElementSize
,
"rpmdForces"
);
bool
useDoublePrecision
=
(
cl
.
getUseDoublePrecision
()
||
cl
.
getUseMixedPrecision
());
int
elementSize
=
(
useDoublePrecision
?
sizeof
(
mm_double4
)
:
sizeof
(
mm_float4
));
positions
=
new
OpenCLArray
(
cl
,
numCopies
*
paddedParticles
,
elementSize
,
"rpmdPositions"
);
velocities
=
new
OpenCLArray
(
cl
,
numCopies
*
paddedParticles
,
elementSize
,
"rpmdVelocities"
);
cl
.
getIntegrationUtilities
().
initRandomNumberGenerator
((
unsigned
int
)
integrator
.
getRandomNumberSeed
());
// Fill in the posq and velm arrays with safe values to avoid a risk of nans.
vector
<
mm_float4
>
temp
(
positions
->
getSize
());
for
(
int
i
=
0
;
i
<
positions
->
getSize
();
i
++
)
temp
[
i
]
=
mm_float4
(
0
,
0
,
0
,
0
);
positions
->
upload
(
temp
);
for
(
int
i
=
0
;
i
<
velocities
->
getSize
();
i
++
)
temp
[
i
]
=
mm_float4
(
0
,
0
,
0
,
1
);
velocities
->
upload
(
temp
);
if
(
useDoublePrecision
)
{
vector
<
mm_double4
>
temp
(
positions
->
getSize
());
for
(
int
i
=
0
;
i
<
positions
->
getSize
();
i
++
)
temp
[
i
]
=
mm_double4
(
0
,
0
,
0
,
0
);
positions
->
upload
(
temp
);
for
(
int
i
=
0
;
i
<
velocities
->
getSize
();
i
++
)
temp
[
i
]
=
mm_double4
(
0
,
0
,
0
,
1
);
velocities
->
upload
(
temp
);
}
else
{
vector
<
mm_float4
>
temp
(
positions
->
getSize
());
for
(
int
i
=
0
;
i
<
positions
->
getSize
();
i
++
)
temp
[
i
]
=
mm_float4
(
0
,
0
,
0
,
0
);
positions
->
upload
(
temp
);
for
(
int
i
=
0
;
i
<
velocities
->
getSize
();
i
++
)
temp
[
i
]
=
mm_float4
(
0
,
0
,
0
,
1
);
velocities
->
upload
(
temp
);
}
// Create kernels.
...
...
@@ -80,6 +95,7 @@ void OpenCLIntegrateRPMDStepKernel::initialize(const System& system, const RPMDI
defines
[
"NUM_ATOMS"
]
=
cl
.
intToString
(
cl
.
getNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
cl
.
intToString
(
cl
.
getPaddedNumAtoms
());
defines
[
"NUM_COPIES"
]
=
cl
.
intToString
(
numCopies
);
defines
[
"THREAD_BLOCK_SIZE"
]
=
cl
.
intToString
(
workgroupSize
);
defines
[
"HBAR"
]
=
cl
.
doubleToString
(
1.054571628e-34
*
AVOGADRO
/
(
1000
*
1e-12
));
defines
[
"SCALE"
]
=
cl
.
doubleToString
(
1.0
/
sqrt
((
double
)
numCopies
));
defines
[
"M_PI"
]
=
cl
.
doubleToString
(
M_PI
);
...
...
@@ -92,8 +108,9 @@ void OpenCLIntegrateRPMDStepKernel::initialize(const System& system, const RPMDI
pileKernel
=
cl
::
Kernel
(
program
,
"applyPileThermostat"
);
stepKernel
=
cl
::
Kernel
(
program
,
"integrateStep"
);
velocitiesKernel
=
cl
::
Kernel
(
program
,
"advanceVelocities"
);
copyToContextKernel
=
cl
::
Kernel
(
program
,
"copyToContext"
);
copyFromContextKernel
=
cl
::
Kernel
(
program
,
"copyFromContext"
);
copyPositionsToContextKernel
=
cl
::
Kernel
(
program
,
"copyPositionsToContext"
);
copyVelocitiesToContextKernel
=
cl
::
Kernel
(
program
,
"copyVelocitiesToContext"
);
copyForcesFromContextKernel
=
cl
::
Kernel
(
program
,
"copyForcesFromContext"
);
translateKernel
=
cl
::
Kernel
(
program
,
"applyCellTranslations"
);
}
...
...
@@ -102,48 +119,56 @@ void OpenCLIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDInte
if
(
!
hasInitializedKernel
)
{
hasInitializedKernel
=
true
;
pileKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
velocities
->
getDeviceBuffer
());
pileKernel
.
setArg
(
1
,
2
*
workgroupSize
*
sizeof
(
mm_float4
),
NULL
);
pileKernel
.
setArg
(
2
,
2
*
workgroupSize
*
sizeof
(
mm_float4
),
NULL
);
pileKernel
.
setArg
(
3
,
numCopies
*
sizeof
(
mm_float2
),
NULL
);
stepKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
positions
->
getDeviceBuffer
());
stepKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
velocities
->
getDeviceBuffer
());
stepKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
forces
->
getDeviceBuffer
());
stepKernel
.
setArg
(
3
,
2
*
workgroupSize
*
sizeof
(
mm_float4
),
NULL
);
stepKernel
.
setArg
(
4
,
2
*
workgroupSize
*
sizeof
(
mm_float4
),
NULL
);
stepKernel
.
setArg
(
5
,
2
*
workgroupSize
*
sizeof
(
mm_float4
),
NULL
);
stepKernel
.
setArg
(
6
,
numCopies
*
sizeof
(
mm_float2
),
NULL
);
velocitiesKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
velocities
->
getDeviceBuffer
());
velocitiesKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
forces
->
getDeviceBuffer
());
translateKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
positions
->
getDeviceBuffer
());
translateKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
cl
.
getPosq
().
getDeviceBuffer
());
translateKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
cl
.
getAtomIndexArray
().
getDeviceBuffer
());
copyPositionsToContextKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
positions
->
getDeviceBuffer
());
copyPositionsToContextKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
cl
.
getPosq
().
getDeviceBuffer
());
copyPositionsToContextKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
cl
.
getAtomIndexArray
().
getDeviceBuffer
());
copyVelocitiesToContextKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
velocities
->
getDeviceBuffer
());
copyVelocitiesToContextKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
cl
.
getVelm
().
getDeviceBuffer
());
copyVelocitiesToContextKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
cl
.
getAtomIndexArray
().
getDeviceBuffer
());
copyForcesFromContextKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
cl
.
getForce
().
getDeviceBuffer
());
copyForcesFromContextKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
forces
->
getDeviceBuffer
());
copyForcesFromContextKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
cl
.
getAtomIndexArray
().
getDeviceBuffer
());
}
// Loop over copies and compute the force on each one.
copyToContextKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
positions
->
getDeviceBuffer
());
copyToContextKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
cl
.
getPosq
().
getDeviceBuffer
());
copyToContextKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
cl
.
getAtomIndexArray
().
getDeviceBuffer
());
copyFromContextKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
cl
.
getForce
().
getDeviceBuffer
());
copyFromContextKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
forces
->
getDeviceBuffer
());
copyFromContextKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
cl
.
getAtomIndexArray
().
getDeviceBuffer
());
if
(
!
forcesAreValid
)
computeForces
(
context
);
// Apply the PILE-L thermostat.
bool
useDoublePrecision
=
(
cl
.
getUseDoublePrecision
()
||
cl
.
getUseMixedPrecision
());
const
double
dt
=
integrator
.
getStepSize
();
pileKernel
.
setArg
<
cl_uint
>
(
5
,
integration
.
prepareRandomNumbers
(
numParticles
*
numCopies
));
pileKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
integration
.
getRandom
().
getDeviceBuffer
());
// Do this *after* prepareRandomNumbers(), which might rebuild the array.
pileKernel
.
setArg
<
cl_float
>
(
6
,
(
cl_float
)
dt
);
pileKernel
.
setArg
<
cl_float
>
(
7
,
(
cl_float
)
(
integrator
.
getTemperature
()
*
BOLTZ
));
pileKernel
.
setArg
<
cl_float
>
(
8
,
(
cl_float
)
integrator
.
getFriction
());
pileKernel
.
setArg
<
cl_uint
>
(
2
,
integration
.
prepareRandomNumbers
(
numParticles
*
numCopies
));
pileKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
integration
.
getRandom
().
getDeviceBuffer
());
// Do this *after* prepareRandomNumbers(), which might rebuild the array.
if
(
useDoublePrecision
)
{
pileKernel
.
setArg
<
cl_double
>
(
3
,
dt
);
pileKernel
.
setArg
<
cl_double
>
(
4
,
integrator
.
getTemperature
()
*
BOLTZ
);
pileKernel
.
setArg
<
cl_double
>
(
5
,
integrator
.
getFriction
());
stepKernel
.
setArg
<
cl_double
>
(
3
,
dt
);
stepKernel
.
setArg
<
cl_double
>
(
4
,
integrator
.
getTemperature
()
*
BOLTZ
);
velocitiesKernel
.
setArg
<
cl_double
>
(
2
,
dt
);
}
else
{
pileKernel
.
setArg
<
cl_float
>
(
3
,
(
cl_float
)
dt
);
pileKernel
.
setArg
<
cl_float
>
(
4
,
(
cl_float
)
(
integrator
.
getTemperature
()
*
BOLTZ
));
pileKernel
.
setArg
<
cl_float
>
(
5
,
(
cl_float
)
integrator
.
getFriction
());
stepKernel
.
setArg
<
cl_float
>
(
3
,
(
cl_float
)
dt
);
stepKernel
.
setArg
<
cl_float
>
(
4
,
(
cl_float
)
(
integrator
.
getTemperature
()
*
BOLTZ
));
velocitiesKernel
.
setArg
<
cl_float
>
(
2
,
(
cl_float
)
dt
);
}
cl
.
executeKernel
(
pileKernel
,
numParticles
*
numCopies
,
workgroupSize
);
// Update positions and velocities.
stepKernel
.
setArg
<
cl_float
>
(
7
,
(
cl_float
)
dt
);
stepKernel
.
setArg
<
cl_float
>
(
8
,
(
cl_float
)
(
integrator
.
getTemperature
()
*
BOLTZ
));
cl
.
executeKernel
(
stepKernel
,
numParticles
*
numCopies
,
workgroupSize
);
// Calculate forces based on the updated positions.
...
...
@@ -151,12 +176,11 @@ void OpenCLIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDInte
computeForces
(
context
);
// Update velocities.
velocitiesKernel
.
setArg
<
cl_float
>
(
2
,
(
cl_float
)
dt
);
cl
.
executeKernel
(
velocitiesKernel
,
numParticles
*
numCopies
,
workgroupSize
);
// Apply the PILE-L thermostat again.
pileKernel
.
setArg
<
cl_uint
>
(
5
,
integration
.
prepareRandomNumbers
(
numParticles
*
numCopies
));
pileKernel
.
setArg
<
cl_uint
>
(
2
,
integration
.
prepareRandomNumbers
(
numParticles
*
numCopies
));
cl
.
executeKernel
(
pileKernel
,
numParticles
*
numCopies
,
workgroupSize
);
// Update the time and step count.
...
...
@@ -167,11 +191,11 @@ void OpenCLIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDInte
void
OpenCLIntegrateRPMDStepKernel
::
computeForces
(
ContextImpl
&
context
)
{
for
(
int
i
=
0
;
i
<
numCopies
;
i
++
)
{
copyToContextKernel
.
setArg
<
cl_int
>
(
3
,
i
);
cl
.
executeKernel
(
copyToContextKernel
,
cl
.
getNumAtoms
());
copy
Positions
ToContextKernel
.
setArg
<
cl_int
>
(
3
,
i
);
cl
.
executeKernel
(
copy
Positions
ToContextKernel
,
cl
.
getNumAtoms
());
context
.
calcForcesAndEnergy
(
true
,
false
);
copyFromContextKernel
.
setArg
<
cl_int
>
(
3
,
i
);
cl
.
executeKernel
(
copyFromContextKernel
,
cl
.
getNumAtoms
());
copyF
orcesF
romContextKernel
.
setArg
<
cl_int
>
(
3
,
i
);
cl
.
executeKernel
(
copyF
orcesF
romContextKernel
,
cl
.
getNumAtoms
());
if
(
cl
.
getAtomsWereReordered
()
&&
cl
.
getNonbondedUtilities
().
getUsePeriodic
())
{
// Atoms may have been translated into a different periodic box, so apply
// the same translation to all the beads.
...
...
@@ -191,11 +215,28 @@ void OpenCLIntegrateRPMDStepKernel::setPositions(int copy, const vector<Vec3>& p
throw
OpenMMException
(
"RPMDIntegrator: Cannot set positions before the integrator is added to a Context"
);
if
(
pos
.
size
()
!=
numParticles
)
throw
OpenMMException
(
"RPMDIntegrator: wrong number of values passed to setPositions()"
);
vector
<
mm_float4
>
posq
(
cl
.
getPaddedNumAtoms
());
cl
.
getPosq
().
download
(
posq
);
for
(
int
i
=
0
;
i
<
numParticles
;
i
++
)
posq
[
i
]
=
mm_float4
((
cl_float
)
pos
[
i
][
0
],
(
cl_float
)
pos
[
i
][
1
],
(
cl_float
)
pos
[
i
][
2
],
posq
[
i
].
w
);
cl
.
getQueue
().
enqueueWriteBuffer
(
positions
->
getDeviceBuffer
(),
CL_TRUE
,
copy
*
cl
.
getPaddedNumAtoms
()
*
sizeof
(
mm_float4
),
numParticles
*
sizeof
(
mm_float4
),
&
posq
[
0
]);
if
(
cl
.
getUseDoublePrecision
())
{
vector
<
mm_double4
>
posq
(
cl
.
getPaddedNumAtoms
());
cl
.
getPosq
().
download
(
posq
);
for
(
int
i
=
0
;
i
<
numParticles
;
i
++
)
posq
[
i
]
=
mm_double4
(
pos
[
i
][
0
],
pos
[
i
][
1
],
pos
[
i
][
2
],
posq
[
i
].
w
);
cl
.
getQueue
().
enqueueWriteBuffer
(
positions
->
getDeviceBuffer
(),
CL_TRUE
,
copy
*
cl
.
getPaddedNumAtoms
()
*
sizeof
(
mm_double4
),
numParticles
*
sizeof
(
mm_double4
),
&
posq
[
0
]);
}
else
if
(
cl
.
getUseMixedPrecision
())
{
vector
<
mm_float4
>
posqf
(
cl
.
getPaddedNumAtoms
());
cl
.
getPosq
().
download
(
posqf
);
vector
<
mm_double4
>
posq
(
cl
.
getPaddedNumAtoms
());
for
(
int
i
=
0
;
i
<
numParticles
;
i
++
)
posq
[
i
]
=
mm_double4
(
pos
[
i
][
0
],
pos
[
i
][
1
],
pos
[
i
][
2
],
posqf
[
i
].
w
);
cl
.
getQueue
().
enqueueWriteBuffer
(
positions
->
getDeviceBuffer
(),
CL_TRUE
,
copy
*
cl
.
getPaddedNumAtoms
()
*
sizeof
(
mm_double4
),
numParticles
*
sizeof
(
mm_double4
),
&
posq
[
0
]);
}
else
{
vector
<
mm_float4
>
posq
(
cl
.
getPaddedNumAtoms
());
cl
.
getPosq
().
download
(
posq
);
for
(
int
i
=
0
;
i
<
numParticles
;
i
++
)
posq
[
i
]
=
mm_float4
((
cl_float
)
pos
[
i
][
0
],
(
cl_float
)
pos
[
i
][
1
],
(
cl_float
)
pos
[
i
][
2
],
posq
[
i
].
w
);
cl
.
getQueue
().
enqueueWriteBuffer
(
positions
->
getDeviceBuffer
(),
CL_TRUE
,
copy
*
cl
.
getPaddedNumAtoms
()
*
sizeof
(
mm_float4
),
numParticles
*
sizeof
(
mm_float4
),
&
posq
[
0
]);
}
}
void
OpenCLIntegrateRPMDStepKernel
::
setVelocities
(
int
copy
,
const
vector
<
Vec3
>&
vel
)
{
...
...
@@ -203,22 +244,27 @@ void OpenCLIntegrateRPMDStepKernel::setVelocities(int copy, const vector<Vec3>&
throw
OpenMMException
(
"RPMDIntegrator: Cannot set velocities before the integrator is added to a Context"
);
if
(
vel
.
size
()
!=
numParticles
)
throw
OpenMMException
(
"RPMDIntegrator: wrong number of values passed to setVelocities()"
);
vector
<
mm_float4
>
velm
(
cl
.
getPaddedNumAtoms
());
cl
.
getVelm
().
download
(
velm
);
for
(
int
i
=
0
;
i
<
numParticles
;
i
++
)
velm
[
i
]
=
mm_float4
((
cl_float
)
vel
[
i
][
0
],
(
cl_float
)
vel
[
i
][
1
],
(
cl_float
)
vel
[
i
][
2
],
velm
[
i
].
w
);
cl
.
getQueue
().
enqueueWriteBuffer
(
velocities
->
getDeviceBuffer
(),
CL_TRUE
,
copy
*
cl
.
getPaddedNumAtoms
()
*
sizeof
(
mm_float4
),
numParticles
*
sizeof
(
mm_float4
),
&
velm
[
0
]);
if
(
cl
.
getUseDoublePrecision
()
||
cl
.
getUseMixedPrecision
())
{
vector
<
mm_double4
>
velm
(
cl
.
getPaddedNumAtoms
());
cl
.
getVelm
().
download
(
velm
);
for
(
int
i
=
0
;
i
<
numParticles
;
i
++
)
velm
[
i
]
=
mm_double4
(
vel
[
i
][
0
],
vel
[
i
][
1
],
vel
[
i
][
2
],
velm
[
i
].
w
);
cl
.
getQueue
().
enqueueWriteBuffer
(
velocities
->
getDeviceBuffer
(),
CL_TRUE
,
copy
*
cl
.
getPaddedNumAtoms
()
*
sizeof
(
mm_double4
),
numParticles
*
sizeof
(
mm_double4
),
&
velm
[
0
]);
}
else
{
vector
<
mm_float4
>
velm
(
cl
.
getPaddedNumAtoms
());
cl
.
getVelm
().
download
(
velm
);
for
(
int
i
=
0
;
i
<
numParticles
;
i
++
)
velm
[
i
]
=
mm_float4
((
cl_float
)
vel
[
i
][
0
],
(
cl_float
)
vel
[
i
][
1
],
(
cl_float
)
vel
[
i
][
2
],
velm
[
i
].
w
);
cl
.
getQueue
().
enqueueWriteBuffer
(
velocities
->
getDeviceBuffer
(),
CL_TRUE
,
copy
*
cl
.
getPaddedNumAtoms
()
*
sizeof
(
mm_float4
),
numParticles
*
sizeof
(
mm_float4
),
&
velm
[
0
]);
}
}
void
OpenCLIntegrateRPMDStepKernel
::
copyToContext
(
int
copy
,
ContextImpl
&
context
)
{
copyToContextKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
positions
->
getDeviceBuffer
());
copyToContextKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
cl
.
getPosq
().
getDeviceBuffer
());
copyToContextKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
cl
.
getAtomIndexArray
().
getDeviceBuffer
());
copyToContextKernel
.
setArg
<
cl_int
>
(
3
,
copy
);
cl
.
executeKernel
(
copyToContextKernel
,
cl
.
getNumAtoms
());
copyToContextKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
velocities
->
getDeviceBuffer
());
copyToContextKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
cl
.
getVelm
().
getDeviceBuffer
());
cl
.
executeKernel
(
copyToContextKernel
,
cl
.
getNumAtoms
());
copyPositionsToContextKernel
.
setArg
<
cl_int
>
(
3
,
copy
);
cl
.
executeKernel
(
copyPositionsToContextKernel
,
cl
.
getNumAtoms
());
copyVelocitiesToContextKernel
.
setArg
<
cl_int
>
(
3
,
copy
);
cl
.
executeKernel
(
copyVelocitiesToContextKernel
,
cl
.
getNumAtoms
());
}
string
OpenCLIntegrateRPMDStepKernel
::
createFFT
(
int
size
,
const
string
&
variable
,
bool
forward
)
{
...
...
@@ -232,10 +278,10 @@ string OpenCLIntegrateRPMDStepKernel::createFFT(int size, const string& variable
string
multImag
=
(
forward
?
"multiplyComplexImagPart"
:
"multiplyComplexImagPartConj"
);
source
<<
"{
\n
"
;
source
<<
"__local
float
4* real0 = "
<<
variable
<<
"real;
\n
"
;
source
<<
"__local
float
4* imag0 = "
<<
variable
<<
"imag;
\n
"
;
source
<<
"__local
float
4* real1 = &temp[blockStart];
\n
"
;
source
<<
"__local
float
4* imag1 = &temp[blockStart+get_local_size(0)];
\n
"
;
source
<<
"__local
mixed
4* real0 = "
<<
variable
<<
"real;
\n
"
;
source
<<
"__local
mixed
4* imag0 = "
<<
variable
<<
"imag;
\n
"
;
source
<<
"__local
mixed
4* real1 = &temp[blockStart];
\n
"
;
source
<<
"__local
mixed
4* imag1 = &temp[blockStart+get_local_size(0)];
\n
"
;
// Factor size, generating an appropriate block of code for each factor.
...
...
@@ -249,39 +295,39 @@ string OpenCLIntegrateRPMDStepKernel::createFFT(int size, const string& variable
source
<<
"if (indexInBlock < "
<<
(
L
*
m
)
<<
") {
\n
"
;
source
<<
"int i = indexInBlock;
\n
"
;
source
<<
"int j = i/"
<<
m
<<
";
\n
"
;
source
<<
"
float
4 c0r = real"
<<
input
<<
"[i];
\n
"
;
source
<<
"
float
4 c0i = imag"
<<
input
<<
"[i];
\n
"
;
source
<<
"
float
4 c1r = real"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"
float
4 c1i = imag"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"
float
4 c2r = real"
<<
input
<<
"[i+"
<<
(
2
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
float
4 c2i = imag"
<<
input
<<
"[i+"
<<
(
2
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
float
4 c3r = real"
<<
input
<<
"[i+"
<<
(
3
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
float
4 c3i = imag"
<<
input
<<
"[i+"
<<
(
3
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
float
4 c4r = real"
<<
input
<<
"[i+"
<<
(
4
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
float
4 c4i = imag"
<<
input
<<
"[i+"
<<
(
4
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
float
4 d0r = c1r+c4r;
\n
"
;
source
<<
"
float
4 d0i = c1i+c4i;
\n
"
;
source
<<
"
float
4 d1r = c2r+c3r;
\n
"
;
source
<<
"
float
4 d1i = c2i+c3i;
\n
"
;
source
<<
"
float
4 d2r = "
<<
cl
.
doubleToString
(
sin
(
0.4
*
M_PI
))
<<
"*(c1r-c4r);
\n
"
;
source
<<
"
float
4 d2i = "
<<
cl
.
doubleToString
(
sin
(
0.4
*
M_PI
))
<<
"*(c1i-c4i);
\n
"
;
source
<<
"
float
4 d3r = "
<<
cl
.
doubleToString
(
sin
(
0.4
*
M_PI
))
<<
"*(c2r-c3r);
\n
"
;
source
<<
"
float
4 d3i = "
<<
cl
.
doubleToString
(
sin
(
0.4
*
M_PI
))
<<
"*(c2i-c3i);
\n
"
;
source
<<
"
float
4 d4r = d0r+d1r;
\n
"
;
source
<<
"
float
4 d4i = d0i+d1i;
\n
"
;
source
<<
"
float
4 d5r = "
<<
cl
.
doubleToString
(
0.25
*
sqrt
(
5.0
))
<<
"*(d0r-d1r);
\n
"
;
source
<<
"
float
4 d5i = "
<<
cl
.
doubleToString
(
0.25
*
sqrt
(
5.0
))
<<
"*(d0i-d1i);
\n
"
;
source
<<
"
float
4 d6r = c0r-0.25f*d4r;
\n
"
;
source
<<
"
float
4 d6i = c0i-0.25f*d4i;
\n
"
;
source
<<
"
float
4 d7r = d6r+d5r;
\n
"
;
source
<<
"
float
4 d7i = d6i+d5i;
\n
"
;
source
<<
"
float
4 d8r = d6r-d5r;
\n
"
;
source
<<
"
float
4 d8i = d6i-d5i;
\n
"
;
source
<<
"
mixed
4 c0r = real"
<<
input
<<
"[i];
\n
"
;
source
<<
"
mixed
4 c0i = imag"
<<
input
<<
"[i];
\n
"
;
source
<<
"
mixed
4 c1r = real"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"
mixed
4 c1i = imag"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"
mixed
4 c2r = real"
<<
input
<<
"[i+"
<<
(
2
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
mixed
4 c2i = imag"
<<
input
<<
"[i+"
<<
(
2
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
mixed
4 c3r = real"
<<
input
<<
"[i+"
<<
(
3
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
mixed
4 c3i = imag"
<<
input
<<
"[i+"
<<
(
3
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
mixed
4 c4r = real"
<<
input
<<
"[i+"
<<
(
4
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
mixed
4 c4i = imag"
<<
input
<<
"[i+"
<<
(
4
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
mixed
4 d0r = c1r+c4r;
\n
"
;
source
<<
"
mixed
4 d0i = c1i+c4i;
\n
"
;
source
<<
"
mixed
4 d1r = c2r+c3r;
\n
"
;
source
<<
"
mixed
4 d1i = c2i+c3i;
\n
"
;
source
<<
"
mixed
4 d2r = "
<<
cl
.
doubleToString
(
sin
(
0.4
*
M_PI
))
<<
"*(c1r-c4r);
\n
"
;
source
<<
"
mixed
4 d2i = "
<<
cl
.
doubleToString
(
sin
(
0.4
*
M_PI
))
<<
"*(c1i-c4i);
\n
"
;
source
<<
"
mixed
4 d3r = "
<<
cl
.
doubleToString
(
sin
(
0.4
*
M_PI
))
<<
"*(c2r-c3r);
\n
"
;
source
<<
"
mixed
4 d3i = "
<<
cl
.
doubleToString
(
sin
(
0.4
*
M_PI
))
<<
"*(c2i-c3i);
\n
"
;
source
<<
"
mixed
4 d4r = d0r+d1r;
\n
"
;
source
<<
"
mixed
4 d4i = d0i+d1i;
\n
"
;
source
<<
"
mixed
4 d5r = "
<<
cl
.
doubleToString
(
0.25
*
sqrt
(
5.0
))
<<
"*(d0r-d1r);
\n
"
;
source
<<
"
mixed
4 d5i = "
<<
cl
.
doubleToString
(
0.25
*
sqrt
(
5.0
))
<<
"*(d0i-d1i);
\n
"
;
source
<<
"
mixed
4 d6r = c0r-0.25f*d4r;
\n
"
;
source
<<
"
mixed
4 d6i = c0i-0.25f*d4i;
\n
"
;
source
<<
"
mixed
4 d7r = d6r+d5r;
\n
"
;
source
<<
"
mixed
4 d7i = d6i+d5i;
\n
"
;
source
<<
"
mixed
4 d8r = d6r-d5r;
\n
"
;
source
<<
"
mixed
4 d8i = d6i-d5i;
\n
"
;
string
coeff
=
cl
.
doubleToString
(
sin
(
0.2
*
M_PI
)
/
sin
(
0.4
*
M_PI
));
source
<<
"
float
4 d9r = "
<<
sign
<<
"*(d2i+"
<<
coeff
<<
"*d3i);
\n
"
;
source
<<
"
float
4 d9i = "
<<
sign
<<
"*(-d2r-"
<<
coeff
<<
"*d3r);
\n
"
;
source
<<
"
float
4 d10r = "
<<
sign
<<
"*("
<<
coeff
<<
"*d2i-d3i);
\n
"
;
source
<<
"
float
4 d10i = "
<<
sign
<<
"*(d3r-"
<<
coeff
<<
"*d2r);
\n
"
;
source
<<
"
mixed
4 d9r = "
<<
sign
<<
"*(d2i+"
<<
coeff
<<
"*d3i);
\n
"
;
source
<<
"
mixed
4 d9i = "
<<
sign
<<
"*(-d2r-"
<<
coeff
<<
"*d3r);
\n
"
;
source
<<
"
mixed
4 d10r = "
<<
sign
<<
"*("
<<
coeff
<<
"*d2i-d3i);
\n
"
;
source
<<
"
mixed
4 d10i = "
<<
sign
<<
"*(d3r-"
<<
coeff
<<
"*d2r);
\n
"
;
source
<<
"real"
<<
output
<<
"[i+4*j*"
<<
m
<<
"] = c0r+d4r;
\n
"
;
source
<<
"imag"
<<
output
<<
"[i+4*j*"
<<
m
<<
"] = c0i+d4i;
\n
"
;
source
<<
"real"
<<
output
<<
"[i+(4*j+1)*"
<<
m
<<
"] = "
<<
multReal
<<
"(w[j*"
<<
size
<<
"/"
<<
(
5
*
L
)
<<
"], d7r+d9r, d7i+d9i);
\n
"
;
...
...
@@ -302,22 +348,22 @@ string OpenCLIntegrateRPMDStepKernel::createFFT(int size, const string& variable
source
<<
"if (indexInBlock < "
<<
(
L
*
m
)
<<
") {
\n
"
;
source
<<
"int i = indexInBlock;
\n
"
;
source
<<
"int j = i/"
<<
m
<<
";
\n
"
;
source
<<
"
float
4 c0r = real"
<<
input
<<
"[i];
\n
"
;
source
<<
"
float
4 c0i = imag"
<<
input
<<
"[i];
\n
"
;
source
<<
"
float
4 c1r = real"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"
float
4 c1i = imag"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"
float
4 c2r = real"
<<
input
<<
"[i+"
<<
(
2
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
float
4 c2i = imag"
<<
input
<<
"[i+"
<<
(
2
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
float
4 c3r = real"
<<
input
<<
"[i+"
<<
(
3
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
float
4 c3i = imag"
<<
input
<<
"[i+"
<<
(
3
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
float
4 d0r = c0r+c2r;
\n
"
;
source
<<
"
float
4 d0i = c0i+c2i;
\n
"
;
source
<<
"
float
4 d1r = c0r-c2r;
\n
"
;
source
<<
"
float
4 d1i = c0i-c2i;
\n
"
;
source
<<
"
float
4 d2r = c1r+c3r;
\n
"
;
source
<<
"
float
4 d2i = c1i+c3i;
\n
"
;
source
<<
"
float
4 d3r = "
<<
sign
<<
"*(c1i-c3i);
\n
"
;
source
<<
"
float
4 d3i = "
<<
sign
<<
"*(c3r-c1r);
\n
"
;
source
<<
"
mixed
4 c0r = real"
<<
input
<<
"[i];
\n
"
;
source
<<
"
mixed
4 c0i = imag"
<<
input
<<
"[i];
\n
"
;
source
<<
"
mixed
4 c1r = real"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"
mixed
4 c1i = imag"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"
mixed
4 c2r = real"
<<
input
<<
"[i+"
<<
(
2
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
mixed
4 c2i = imag"
<<
input
<<
"[i+"
<<
(
2
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
mixed
4 c3r = real"
<<
input
<<
"[i+"
<<
(
3
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
mixed
4 c3i = imag"
<<
input
<<
"[i+"
<<
(
3
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
mixed
4 d0r = c0r+c2r;
\n
"
;
source
<<
"
mixed
4 d0i = c0i+c2i;
\n
"
;
source
<<
"
mixed
4 d1r = c0r-c2r;
\n
"
;
source
<<
"
mixed
4 d1i = c0i-c2i;
\n
"
;
source
<<
"
mixed
4 d2r = c1r+c3r;
\n
"
;
source
<<
"
mixed
4 d2i = c1i+c3i;
\n
"
;
source
<<
"
mixed
4 d3r = "
<<
sign
<<
"*(c1i-c3i);
\n
"
;
source
<<
"
mixed
4 d3i = "
<<
sign
<<
"*(c3r-c1r);
\n
"
;
source
<<
"real"
<<
output
<<
"[i+3*j*"
<<
m
<<
"] = d0r+d2r;
\n
"
;
source
<<
"imag"
<<
output
<<
"[i+3*j*"
<<
m
<<
"] = d0i+d2i;
\n
"
;
source
<<
"real"
<<
output
<<
"[i+(3*j+1)*"
<<
m
<<
"] = "
<<
multReal
<<
"(w[j*"
<<
size
<<
"/"
<<
(
4
*
L
)
<<
"], d1r+d3r, d1i+d3i);
\n
"
;
...
...
@@ -336,18 +382,18 @@ string OpenCLIntegrateRPMDStepKernel::createFFT(int size, const string& variable
source
<<
"if (indexInBlock < "
<<
(
L
*
m
)
<<
") {
\n
"
;
source
<<
"int i = indexInBlock;
\n
"
;
source
<<
"int j = i/"
<<
m
<<
";
\n
"
;
source
<<
"
float
4 c0r = real"
<<
input
<<
"[i];
\n
"
;
source
<<
"
float
4 c0i = imag"
<<
input
<<
"[i];
\n
"
;
source
<<
"
float
4 c1r = real"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"
float
4 c1i = imag"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"
float
4 c2r = real"
<<
input
<<
"[i+"
<<
(
2
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
float
4 c2i = imag"
<<
input
<<
"[i+"
<<
(
2
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
float
4 d0r = c1r+c2r;
\n
"
;
source
<<
"
float
4 d0i = c1i+c2i;
\n
"
;
source
<<
"
float
4 d1r = c0r-0.5f*d0r;
\n
"
;
source
<<
"
float
4 d1i = c0i-0.5f*d0i;
\n
"
;
source
<<
"
float
4 d2r = "
<<
sign
<<
"*"
<<
cl
.
doubleToString
(
sin
(
M_PI
/
3.0
))
<<
"*(c1i-c2i);
\n
"
;
source
<<
"
float
4 d2i = "
<<
sign
<<
"*"
<<
cl
.
doubleToString
(
sin
(
M_PI
/
3.0
))
<<
"*(c2r-c1r);
\n
"
;
source
<<
"
mixed
4 c0r = real"
<<
input
<<
"[i];
\n
"
;
source
<<
"
mixed
4 c0i = imag"
<<
input
<<
"[i];
\n
"
;
source
<<
"
mixed
4 c1r = real"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"
mixed
4 c1i = imag"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"
mixed
4 c2r = real"
<<
input
<<
"[i+"
<<
(
2
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
mixed
4 c2i = imag"
<<
input
<<
"[i+"
<<
(
2
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"
mixed
4 d0r = c1r+c2r;
\n
"
;
source
<<
"
mixed
4 d0i = c1i+c2i;
\n
"
;
source
<<
"
mixed
4 d1r = c0r-0.5f*d0r;
\n
"
;
source
<<
"
mixed
4 d1i = c0i-0.5f*d0i;
\n
"
;
source
<<
"
mixed
4 d2r = "
<<
sign
<<
"*"
<<
cl
.
doubleToString
(
sin
(
M_PI
/
3.0
))
<<
"*(c1i-c2i);
\n
"
;
source
<<
"
mixed
4 d2i = "
<<
sign
<<
"*"
<<
cl
.
doubleToString
(
sin
(
M_PI
/
3.0
))
<<
"*(c2r-c1r);
\n
"
;
source
<<
"real"
<<
output
<<
"[i+2*j*"
<<
m
<<
"] = c0r+d0r;
\n
"
;
source
<<
"imag"
<<
output
<<
"[i+2*j*"
<<
m
<<
"] = c0i+d0i;
\n
"
;
source
<<
"real"
<<
output
<<
"[i+(2*j+1)*"
<<
m
<<
"] = "
<<
multReal
<<
"(w[j*"
<<
size
<<
"/"
<<
(
3
*
L
)
<<
"], d1r+d2r, d1i+d2i);
\n
"
;
...
...
@@ -364,10 +410,10 @@ string OpenCLIntegrateRPMDStepKernel::createFFT(int size, const string& variable
source
<<
"if (indexInBlock < "
<<
(
L
*
m
)
<<
") {
\n
"
;
source
<<
"int i = indexInBlock;
\n
"
;
source
<<
"int j = i/"
<<
m
<<
";
\n
"
;
source
<<
"
float
4 c0r = real"
<<
input
<<
"[i];
\n
"
;
source
<<
"
float
4 c0i = imag"
<<
input
<<
"[i];
\n
"
;
source
<<
"
float
4 c1r = real"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"
float
4 c1i = imag"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"
mixed
4 c0r = real"
<<
input
<<
"[i];
\n
"
;
source
<<
"
mixed
4 c0i = imag"
<<
input
<<
"[i];
\n
"
;
source
<<
"
mixed
4 c1r = real"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"
mixed
4 c1i = imag"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"real"
<<
output
<<
"[i+j*"
<<
m
<<
"] = c0r+c1r;
\n
"
;
source
<<
"imag"
<<
output
<<
"[i+j*"
<<
m
<<
"] = c0i+c1i;
\n
"
;
source
<<
"real"
<<
output
<<
"[i+(j+1)*"
<<
m
<<
"] = "
<<
multReal
<<
"(w[j*"
<<
size
<<
"/"
<<
(
2
*
L
)
<<
"], c0r-c1r, c0i-c1i);
\n
"
;
...
...
plugins/rpmd/platforms/opencl/src/OpenCLRpmdKernels.h
View file @
cd6af26e
...
...
@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011 Stanford University and the Authors.
*
* Portions copyright (c) 2011
-2012
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -92,7 +92,7 @@ private:
OpenCLArray
*
forces
;
OpenCLArray
*
positions
;
OpenCLArray
*
velocities
;
cl
::
Kernel
pileKernel
,
stepKernel
,
velocitiesKernel
,
copyToContextKernel
,
copyFromContextKernel
,
translateKernel
;
cl
::
Kernel
pileKernel
,
stepKernel
,
velocitiesKernel
,
copy
Positions
ToContextKernel
,
copy
VelocitiesToContextKernel
,
copyForces
FromContextKernel
,
translateKernel
;
};
}
// namespace OpenMM
...
...
plugins/rpmd/platforms/opencl/src/kernels/rpmd.cl
View file @
cd6af26e
float
4
multiplyComplexRealPart
(
float
2
c1,
float
4
c2r,
float
4
c2i
)
{
mixed
4
multiplyComplexRealPart
(
mixed
2
c1,
mixed
4
c2r,
mixed
4
c2i
)
{
return
c1.x*c2r-c1.y*c2i
;
}
float
4
multiplyComplexImagPart
(
float
2
c1,
float
4
c2r,
float
4
c2i
)
{
mixed
4
multiplyComplexImagPart
(
mixed
2
c1,
mixed
4
c2r,
mixed
4
c2i
)
{
return
c1.x*c2i+c1.y*c2r
;
}
float
4
multiplyComplexRealPartConj
(
float
2
c1,
float
4
c2r,
float
4
c2i
)
{
mixed
4
multiplyComplexRealPartConj
(
mixed
2
c1,
mixed
4
c2r,
mixed
4
c2i
)
{
return
c1.x*c2r+c1.y*c2i
;
}
float
4
multiplyComplexImagPartConj
(
float
2
c1,
float
4
c2r,
float
4
c2i
)
{
mixed
4
multiplyComplexImagPartConj
(
mixed
2
c1,
mixed
4
c2r,
mixed
4
c2i
)
{
return
c1.x*c2i-c1.y*c2r
;
}
/**
*
Apply
the
PILE-L
thermostat.
*/
__kernel
void
applyPileThermostat
(
__global
float4*
velm,
__local
float4*
v,
__local
float4*
temp,
__local
float2*
w
,
__global
float4*
random,
unsigned
int
randomIndex,
float
dt,
float
kT,
float
friction
)
{
__kernel
void
applyPileThermostat
(
__global
mixed4*
velm
,
__global
float4*
random,
unsigned
int
randomIndex,
mixed
dt,
mixed
kT,
mixed
friction
)
{
const
int
numBlocks
=
get_global_size
(
0
)
/NUM_COPIES
;
const
int
blockStart
=
NUM_COPIES*
(
get_local_id
(
0
)
/NUM_COPIES
)
;
const
int
indexInBlock
=
get_local_id
(
0
)
-blockStart
;
const
float
nkT
=
NUM_COPIES*kT
;
const
float
twown
=
2.0f*nkT/HBAR
;
const
float
c1_0
=
EXP
(
-0.5f*dt*friction
)
;
const
float
c2_0
=
SQRT
(
1.0f-c1_0*c1_0
)
;
__local
float4*
vreal
=
&v[blockStart]
;
__local
float4*
vimag
=
&v[blockStart+get_local_size
(
0
)
]
;
const
mixed
nkT
=
NUM_COPIES*kT
;
const
mixed
twown
=
2.0f*nkT/HBAR
;
const
mixed
c1_0
=
exp
(
-0.5f*dt*friction
)
;
const
mixed
c2_0
=
sqrt
(
1.0f-c1_0*c1_0
)
;
__local
mixed4
v[2*THREAD_BLOCK_SIZE]
;
__local
mixed4
temp[2*THREAD_BLOCK_SIZE]
;
__local
mixed2
w[NUM_COPIES]
;
__local
mixed4*
vreal
=
&v[blockStart]
;
__local
mixed4*
vimag
=
&v[blockStart+get_local_size
(
0
)
]
;
if
(
get_local_id
(
0
)
<
NUM_COPIES
)
w[indexInBlock]
=
(
float
2
)
(
cos
(
-indexInBlock*2*M_PI/NUM_COPIES
)
,
sin
(
-indexInBlock*2*M_PI/NUM_COPIES
))
;
w[indexInBlock]
=
(
mixed
2
)
(
cos
(
-indexInBlock*2*M_PI/NUM_COPIES
)
,
sin
(
-indexInBlock*2*M_PI/NUM_COPIES
))
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
randomIndex
+=
NUM_COPIES*
(
get_global_id
(
0
)
/NUM_COPIES
)
;
for
(
int
particle
=
get_global_id
(
0
)
/NUM_COPIES
; particle < NUM_ATOMS; particle += numBlocks) {
float
4
particleVelm
=
velm[particle+indexInBlock*PADDED_NUM_ATOMS]
;
float
invMass
=
particleVelm.w
;
float
c3_0
=
c2_0*
SQRT
(
nkT*invMass
)
;
mixed
4
particleVelm
=
velm[particle+indexInBlock*PADDED_NUM_ATOMS]
;
mixed
invMass
=
particleVelm.w
;
mixed
c3_0
=
c2_0*
sqrt
(
nkT*invMass
)
;
//
Forward
FFT.
vreal[indexInBlock]
=
SCALE*particleVelm
;
vimag[indexInBlock]
=
(
float
4
)
(
0.0f,
0.0f,
0.0f,
0.0f
)
;
vimag[indexInBlock]
=
(
mixed
4
)
(
0.0f,
0.0f,
0.0f,
0.0f
)
;
barrier
(
CLK_GLOBAL_MEM_FENCE
)
;
FFT_V_FORWARD
...
...
@@ -49,19 +52,19 @@ __kernel void applyPileThermostat(__global float4* velm, __local float4* v, __lo
if
(
indexInBlock
==
0
)
{
//
Apply
a
local
Langevin
thermostat
to
the
centroid
mode.
vreal[0].xyz
=
vreal[0].xyz*c1_0
+
c3_0*random[randomIndex].xyz
;
vreal[0].xyz
=
vreal[0].xyz*c1_0
+
c3_0*
convert_mixed4
(
random[randomIndex]
)
.
xyz
;
}
else
{
//
Use
critical
damping
white
noise
for
the
remaining
modes.
int
k
=
(
indexInBlock
<=
NUM_COPIES/2
?
indexInBlock
:
NUM_COPIES-indexInBlock
)
;
const
bool
isCenter
=
(
NUM_COPIES%2
==
0
&&
k
==
NUM_COPIES/2
)
;
const
float
wk
=
twown*sin
(
k*M_PI/NUM_COPIES
)
;
const
float
c1
=
EXP
(
-wk*dt
)
;
const
float
c2
=
SQRT
((
1.0f-c1*c1
)
/2.0f
)
*
(
isCenter
?
sqrt
(
2.0f
)
:
1.0f
)
;
const
float
c3
=
c2*
SQRT
(
nkT*invMass
)
;
float
4
rand1
=
c3*random[randomIndex+k]
;
float
4
rand2
=
(
isCenter
?
0.0f
:
c3*random[randomIndex+NUM_COPIES-k]
)
;
const
mixed
wk
=
twown*sin
(
k*M_PI/NUM_COPIES
)
;
const
mixed
c1
=
exp
(
-wk*dt
)
;
const
mixed
c2
=
sqrt
((
1.0f-c1*c1
)
/2.0f
)
*
(
isCenter
?
sqrt
(
2.0f
)
:
1.0f
)
;
const
mixed
c3
=
c2*
sqrt
(
nkT*invMass
)
;
mixed
4
rand1
=
c3*
convert_mixed4
(
random[randomIndex+k]
)
;
mixed
4
rand2
=
(
isCenter
?
0.0f
:
c3*
convert_mixed4
(
random[randomIndex+NUM_COPIES-k]
)
)
;
vreal[indexInBlock].xyz
=
c1*vreal[indexInBlock].xyz
+
rand1.xyz
;
vimag[indexInBlock].xyz
=
c1*vimag[indexInBlock].xyz
+
(
indexInBlock
<
NUM_COPIES/2
?
rand2.xyz
:
-rand2.xyz
)
;
}
...
...
@@ -78,42 +81,45 @@ __kernel void applyPileThermostat(__global float4* velm, __local float4* v, __lo
/**
*
Advance
the
positions
and
velocities.
*/
__kernel
void
integrateStep
(
__global
float4*
posq,
__global
float4*
velm,
__global
float4*
force,
__local
float4*
q,
__local
float4*
v,
__local
float4*
temp,
__local
float2*
w,
float
dt,
float
kT
)
{
__kernel
void
integrateStep
(
__global
mixed4*
posq,
__global
mixed4*
velm,
__global
real4*
force,
mixed
dt,
mixed
kT
)
{
const
int
numBlocks
=
get_global_size
(
0
)
/NUM_COPIES
;
const
int
blockStart
=
NUM_COPIES*
(
get_local_id
(
0
)
/NUM_COPIES
)
;
const
int
indexInBlock
=
get_local_id
(
0
)
-blockStart
;
const
float
nkT
=
NUM_COPIES*kT
;
const
float
twown
=
2.0f*nkT/HBAR
;
const
mixed
nkT
=
NUM_COPIES*kT
;
const
mixed
twown
=
2.0f*nkT/HBAR
;
__local
mixed4
q[2*THREAD_BLOCK_SIZE]
;
__local
mixed4
v[2*THREAD_BLOCK_SIZE]
;
__local
mixed4
temp[2*THREAD_BLOCK_SIZE]
;
__local
mixed2
w[NUM_COPIES]
;
//
Update
velocities.
for
(
int
particle
=
get_global_id
(
0
)
/NUM_COPIES
; particle < NUM_ATOMS; particle += numBlocks) {
int
index
=
particle+indexInBlock*PADDED_NUM_ATOMS
;
float
4
particleVelm
=
velm[index]
;
particleVelm.xyz
+=
force[index].xyz*
(
0.5f*dt*particleVelm.w
)
;
mixed
4
particleVelm
=
velm[index]
;
particleVelm.xyz
+=
convert_mixed4
(
force[index]
)
.
xyz*
(
0.5f*dt*particleVelm.w
)
;
velm[index]
=
particleVelm
;
}
//
Evolve
the
free
ring
polymer
by
transforming
to
the
frequency
domain.
__local
float
4*
qreal
=
&q[blockStart]
;
__local
float
4*
qimag
=
&q[blockStart+get_local_size
(
0
)
]
;
__local
float
4*
vreal
=
&v[blockStart]
;
__local
float
4*
vimag
=
&v[blockStart+get_local_size
(
0
)
]
;
__local
mixed
4*
qreal
=
&q[blockStart]
;
__local
mixed
4*
qimag
=
&q[blockStart+get_local_size
(
0
)
]
;
__local
mixed
4*
vreal
=
&v[blockStart]
;
__local
mixed
4*
vimag
=
&v[blockStart+get_local_size
(
0
)
]
;
if
(
get_local_id
(
0
)
<
NUM_COPIES
)
w[indexInBlock]
=
(
float
2
)
(
cos
(
-indexInBlock*2*M_PI/NUM_COPIES
)
,
sin
(
-indexInBlock*2*M_PI/NUM_COPIES
))
;
w[indexInBlock]
=
(
mixed
2
)
(
cos
(
-indexInBlock*2*M_PI/NUM_COPIES
)
,
sin
(
-indexInBlock*2*M_PI/NUM_COPIES
))
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
int
particle
=
get_global_id
(
0
)
/NUM_COPIES
; particle < NUM_ATOMS; particle += numBlocks) {
float
4
particlePosq
=
posq[particle+indexInBlock*PADDED_NUM_ATOMS]
;
float
4
particleVelm
=
velm[particle+indexInBlock*PADDED_NUM_ATOMS]
;
mixed
4
particlePosq
=
posq[particle+indexInBlock*PADDED_NUM_ATOMS]
;
mixed
4
particleVelm
=
velm[particle+indexInBlock*PADDED_NUM_ATOMS]
;
//
Forward
FFT.
qreal[indexInBlock]
=
SCALE*particlePosq
;
qimag[indexInBlock]
=
(
float
4
)
(
0.0f,
0.0f,
0.0f,
0.0f
)
;
qimag[indexInBlock]
=
(
mixed
4
)
(
0.0f,
0.0f,
0.0f,
0.0f
)
;
vreal[indexInBlock]
=
SCALE*particleVelm
;
vimag[indexInBlock]
=
(
float
4
)
(
0.0f,
0.0f,
0.0f,
0.0f
)
;
vimag[indexInBlock]
=
(
mixed
4
)
(
0.0f,
0.0f,
0.0f,
0.0f
)
;
barrier
(
CLK_GLOBAL_MEM_FENCE
)
;
FFT_Q_FORWARD
FFT_V_FORWARD
...
...
@@ -125,12 +131,12 @@ __kernel void integrateStep(__global float4* posq, __global float4* velm, __glob
qimag[0].xyz
+=
vimag[0].xyz*dt
;
}
else
{
const
float
wk
=
twown*sin
(
indexInBlock*M_PI/NUM_COPIES
)
;
const
float
wt
=
wk*dt
;
const
float
coswt
=
cos
(
wt
)
;
const
float
sinwt
=
sin
(
wt
)
;
const
float
4
vprimereal
=
vreal[indexInBlock]*coswt
-
qreal[indexInBlock]*
(
wk*sinwt
)
; // Advance velocity from t to t+dt
const
float
4
vprimeimag
=
vimag[indexInBlock]*coswt
-
qimag[indexInBlock]*
(
wk*sinwt
)
;
const
mixed
wk
=
twown*sin
(
indexInBlock*M_PI/NUM_COPIES
)
;
const
mixed
wt
=
wk*dt
;
const
mixed
coswt
=
cos
(
wt
)
;
const
mixed
sinwt
=
sin
(
wt
)
;
const
mixed
4
vprimereal
=
vreal[indexInBlock]*coswt
-
qreal[indexInBlock]*
(
wk*sinwt
)
; // Advance velocity from t to t+dt
const
mixed
4
vprimeimag
=
vimag[indexInBlock]*coswt
-
qimag[indexInBlock]*
(
wk*sinwt
)
;
qreal[indexInBlock]
=
vreal[indexInBlock]*
(
sinwt/wk
)
+
qreal[indexInBlock]*coswt
; // Advance position from t to t+dt
qimag[indexInBlock]
=
vimag[indexInBlock]*
(
sinwt/wk
)
+
qimag[indexInBlock]*coswt
;
vreal[indexInBlock]
=
vprimereal
;
...
...
@@ -150,7 +156,7 @@ __kernel void integrateStep(__global float4* posq, __global float4* velm, __glob
/**
*
Advance
the
velocities
by
a
half
step.
*/
__kernel
void
advanceVelocities
(
__global
float
4*
velm,
__global
float
4*
force,
float
dt
)
{
__kernel
void
advanceVelocities
(
__global
mixed
4*
velm,
__global
real
4*
force,
mixed
dt
)
{
const
int
numBlocks
=
get_global_size
(
0
)
/NUM_COPIES
;
const
int
blockStart
=
NUM_COPIES*
(
get_local_id
(
0
)
/NUM_COPIES
)
;
const
int
indexInBlock
=
get_local_id
(
0
)
-blockStart
;
...
...
@@ -159,16 +165,26 @@ __kernel void advanceVelocities(__global float4* velm, __global float4* force, f
for
(
int
particle
=
get_global_id
(
0
)
/NUM_COPIES
; particle < NUM_ATOMS; particle += numBlocks) {
int
index
=
particle+indexInBlock*PADDED_NUM_ATOMS
;
float
4
particleVelm
=
velm[index]
;
particleVelm.xyz
+=
force[index].xyz*
(
0.5f*dt*particleVelm.w
)
;
mixed
4
particleVelm
=
velm[index]
;
particleVelm.xyz
+=
convert_mixed4
(
force[index]
)
.
xyz*
(
0.5f*dt*particleVelm.w
)
;
velm[index]
=
particleVelm
;
}
}
/**
*
Copy
a
set
of
p
er-atom
value
s
from
the
integrator
's
arrays
to
the
context.
*
Copy
a
set
of
p
osition
s
from
the
integrator
's
arrays
to
the
context.
*/
__kernel
void
copyToContext
(
__global
float4*
src,
__global
float4*
dst,
__global
int*
order,
int
copy
)
{
__kernel
void
copyPositionsToContext
(
__global
mixed4*
src,
__global
real4*
dst,
__global
int*
order,
int
copy
)
{
const
int
base
=
copy*PADDED_NUM_ATOMS
;
for
(
int
particle
=
get_global_id
(
0
)
; particle < NUM_ATOMS; particle += get_global_size(0)) {
dst[particle]
=
convert_real4
(
src[base+order[particle]]
)
;
}
}
/**
*
Copy
a
set
of
velocities
from
the
integrator
's
arrays
to
the
context.
*/
__kernel
void
copyVelocitiesToContext
(
__global
mixed4*
src,
__global
mixed4*
dst,
__global
int*
order,
int
copy
)
{
const
int
base
=
copy*PADDED_NUM_ATOMS
;
for
(
int
particle
=
get_global_id
(
0
)
; particle < NUM_ATOMS; particle += get_global_size(0)) {
dst[particle]
=
src[base+order[particle]]
;
...
...
@@ -176,9 +192,9 @@ __kernel void copyToContext(__global float4* src, __global float4* dst, __global
}
/**
*
Copy
a
set
of
per-atom
valu
es
from
the
context
to
the
integrator
's
arrays.
*
Copy
a
set
forc
es
from
the
context
to
the
integrator
's
arrays.
*/
__kernel
void
copyFromContext
(
__global
float
4*
src,
__global
float
4*
dst,
__global
int*
order,
int
copy
)
{
__kernel
void
copyF
orcesF
romContext
(
__global
real
4*
src,
__global
real
4*
dst,
__global
int*
order,
int
copy
)
{
const
int
base
=
copy*PADDED_NUM_ATOMS
;
for
(
int
particle
=
get_global_id
(
0
)
; particle < NUM_ATOMS; particle += get_global_size(0)) {
dst[base+order[particle]]
=
src[particle]
;
...
...
@@ -188,10 +204,10 @@ __kernel void copyFromContext(__global float4* src, __global float4* dst, __glob
/**
*
Update
atom
positions
so
all
copies
are
offset
by
the
same
number
of
periodic
box
widths.
*/
__kernel
void
applyCellTranslations
(
__global
float
4*
posq,
__global
float
4*
movedPos,
__global
int*
order,
int
movedCopy
)
{
__kernel
void
applyCellTranslations
(
__global
mixed
4*
posq,
__global
real
4*
movedPos,
__global
int*
order,
int
movedCopy
)
{
for
(
int
particle
=
get_global_id
(
0
)
; particle < NUM_ATOMS; particle += get_global_size(0)) {
int
index
=
order[particle]
;
float
4
delta
=
movedPos[particle]-posq[movedCopy*PADDED_NUM_ATOMS+index]
;
mixed
4
delta
=
convert_mixed4
(
movedPos[particle]
)
-posq[movedCopy*PADDED_NUM_ATOMS+index]
;
for
(
int
copy
=
0
; copy < NUM_COPIES; copy++)
posq[copy*PADDED_NUM_ATOMS+index]
+=
delta
;
}
...
...
plugins/rpmd/platforms/opencl/tests/CMakeLists.txt
View file @
cd6af26e
...
...
@@ -14,6 +14,10 @@ FOREACH(TEST_PROG ${TEST_PROGS})
# Link with shared library
ADD_EXECUTABLE
(
${
TEST_ROOT
}
${
TEST_PROG
}
)
TARGET_LINK_LIBRARIES
(
${
TEST_ROOT
}
${
SHARED_RPMD_TARGET
}
)
ADD_TEST
(
${
TEST_ROOT
}
${
EXECUTABLE_OUTPUT_PATH
}
/
${
TEST_ROOT
}
)
ADD_TEST
(
${
TEST_ROOT
}
Single
${
EXECUTABLE_OUTPUT_PATH
}
/
${
TEST_ROOT
}
single
)
IF
(
OPENMM_BUILD_CUDA_DOUBLE_PRECISION_TESTS
)
ADD_TEST
(
${
TEST_ROOT
}
Mixed
${
EXECUTABLE_OUTPUT_PATH
}
/
${
TEST_ROOT
}
mixed
)
ADD_TEST
(
${
TEST_ROOT
}
Double
${
EXECUTABLE_OUTPUT_PATH
}
/
${
TEST_ROOT
}
double
)
ENDIF
(
OPENMM_BUILD_CUDA_DOUBLE_PRECISION_TESTS
)
ENDFOREACH
(
TEST_PROG
${
TEST_PROGS
}
)
plugins/rpmd/platforms/opencl/tests/TestOpenCLRpmd.cpp
View file @
cd6af26e
...
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011 Stanford University and the Authors.
*
* Portions copyright (c) 2011
-2012
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -221,9 +221,11 @@ void testParaHydrogen() {
ASSERT_USUALLY_EQUAL_TOL
(
60.0
,
1.5
*
temperature
+
meanKE
,
0.02
);
}
int
main
()
{
int
main
(
int
argc
,
char
*
argv
[]
)
{
try
{
Platform
::
loadPluginsFromDirectory
(
Platform
::
getDefaultPluginsDirectory
());
if
(
argc
>
1
)
Platform
::
getPlatformByName
(
"OpenCL"
).
setPropertyDefaultValue
(
"OpenCLPrecision"
,
string
(
argv
[
1
]));
testFreeParticles
();
testParaHydrogen
();
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment