Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
8d6a2a01
Commit
8d6a2a01
authored
Oct 16, 2012
by
Peter Eastman
Browse files
Beginnings of mixed/double precision support in OpenCL
parent
a3d5f834
Changes
26
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1299 additions
and
616 deletions
+1299
-616
platforms/opencl/include/OpenCLPlatform.h
platforms/opencl/include/OpenCLPlatform.h
+8
-1
platforms/opencl/src/OpenCLContext.cpp
platforms/opencl/src/OpenCLContext.cpp
+199
-73
platforms/opencl/src/OpenCLContext.h
platforms/opencl/src/OpenCLContext.h
+57
-5
platforms/opencl/src/OpenCLIntegrationUtilities.cpp
platforms/opencl/src/OpenCLIntegrationUtilities.cpp
+133
-58
platforms/opencl/src/OpenCLKernels.cpp
platforms/opencl/src/OpenCLKernels.cpp
+474
-184
platforms/opencl/src/OpenCLKernels.h
platforms/opencl/src/OpenCLKernels.h
+4
-1
platforms/opencl/src/OpenCLParameterSet.cpp
platforms/opencl/src/OpenCLParameterSet.cpp
+55
-37
platforms/opencl/src/OpenCLParameterSet.h
platforms/opencl/src/OpenCLParameterSet.h
+8
-6
platforms/opencl/src/OpenCLPlatform.cpp
platforms/opencl/src/OpenCLPlatform.cpp
+10
-4
platforms/opencl/src/kernels/andersenThermostat.cl
platforms/opencl/src/kernels/andersenThermostat.cl
+7
-5
platforms/opencl/src/kernels/brownian.cl
platforms/opencl/src/kernels/brownian.cl
+30
-10
platforms/opencl/src/kernels/ccma.cl
platforms/opencl/src/kernels/ccma.cl
+35
-26
platforms/opencl/src/kernels/constraints.cl
platforms/opencl/src/kernels/constraints.cl
+15
-4
platforms/opencl/src/kernels/customIntegrator.cl
platforms/opencl/src/kernels/customIntegrator.cl
+36
-7
platforms/opencl/src/kernels/customIntegratorGlobal.cl
platforms/opencl/src/kernels/customIntegratorGlobal.cl
+2
-2
platforms/opencl/src/kernels/customIntegratorPerDof.cl
platforms/opencl/src/kernels/customIntegratorPerDof.cl
+33
-23
platforms/opencl/src/kernels/ewald.cl
platforms/opencl/src/kernels/ewald.cl
+0
-1
platforms/opencl/src/kernels/langevin.cl
platforms/opencl/src/kernels/langevin.cl
+38
-29
platforms/opencl/src/kernels/removeCM.cl
platforms/opencl/src/kernels/removeCM.cl
+11
-6
platforms/opencl/src/kernels/settle.cl
platforms/opencl/src/kernels/settle.cl
+144
-134
No files found.
platforms/opencl/include/OpenCLPlatform.h
View file @
8d6a2a01
...
...
@@ -68,11 +68,18 @@ public:
static
const
std
::
string
key
=
"OpenCLPlatformIndex"
;
return
key
;
}
/**
* This is the name of the parameter for selecting what numerical precision to use.
*/
static
const
std
::
string
&
OpenCLPrecision
()
{
static
const
std
::
string
key
=
"OpenCLPrecision"
;
return
key
;
}
};
class
OPENMM_EXPORT
OpenCLPlatform
::
PlatformData
{
public:
PlatformData
(
const
System
&
system
,
const
std
::
string
&
platformPropValue
,
const
std
::
string
&
deviceIndexProperty
);
PlatformData
(
const
System
&
system
,
const
std
::
string
&
platformPropValue
,
const
std
::
string
&
deviceIndexProperty
,
const
std
::
string
&
precisionProperty
);
~
PlatformData
();
void
initializeContexts
(
const
System
&
system
);
void
syncContexts
();
...
...
platforms/opencl/src/OpenCLContext.cpp
View file @
8d6a2a01
...
...
@@ -65,10 +65,24 @@ static void CL_CALLBACK errorCallback(const char* errinfo, const void* private_i
std
::
cerr
<<
"OpenCL internal error: "
<<
errinfo
<<
std
::
endl
;
}
OpenCLContext
::
OpenCLContext
(
const
System
&
system
,
int
platformIndex
,
int
deviceIndex
,
OpenCLPlatform
::
PlatformData
&
platformData
)
:
OpenCLContext
::
OpenCLContext
(
const
System
&
system
,
int
platformIndex
,
int
deviceIndex
,
const
string
&
precision
,
OpenCLPlatform
::
PlatformData
&
platformData
)
:
system
(
system
),
time
(
0.0
),
platformData
(
platformData
),
stepCount
(
0
),
computeForceCount
(
0
),
atomsWereReordered
(
false
),
posq
(
NULL
),
velm
(
NULL
),
forceBuffers
(
NULL
),
longForceBuffer
(
NULL
),
energyBuffer
(
NULL
),
atomIndexDevice
(
NULL
),
integration
(
NULL
),
bonded
(
NULL
),
nonbonded
(
NULL
),
thread
(
NULL
)
{
if
(
precision
==
"single"
)
{
useDoublePrecision
=
false
;
useMixedPrecision
=
false
;
}
else
if
(
precision
==
"mixed"
)
{
useDoublePrecision
=
false
;
useMixedPrecision
=
true
;
}
else
if
(
precision
==
"double"
)
{
useDoublePrecision
=
true
;
useMixedPrecision
=
false
;
}
else
throw
OpenMMException
(
"Illegal value for OpenCLPrecision: "
+
precision
);
try
{
contextIndex
=
platformData
.
contexts
.
size
();
std
::
vector
<
cl
::
Platform
>
platforms
;
...
...
@@ -217,8 +231,27 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
numThreadBlocks
=
numThreadBlocksPerComputeUnit
*
device
.
getInfo
<
CL_DEVICE_MAX_COMPUTE_UNITS
>
();
bonded
=
new
OpenCLBondedUtilities
(
*
this
);
nonbonded
=
new
OpenCLNonbondedUtilities
(
*
this
);
posq
=
OpenCLArray
::
create
<
mm_float4
>
(
*
this
,
paddedNumAtoms
,
"posq"
);
velm
=
OpenCLArray
::
create
<
mm_float4
>
(
*
this
,
paddedNumAtoms
,
"velm"
);
if
(
useDoublePrecision
)
{
posq
=
OpenCLArray
::
create
<
mm_double4
>
(
*
this
,
paddedNumAtoms
,
"posq"
);
velm
=
OpenCLArray
::
create
<
mm_double4
>
(
*
this
,
paddedNumAtoms
,
"velm"
);
compilationDefines
[
"USE_DOUBLE_PRECISION"
]
=
"1"
;
compilationDefines
[
"convert_real4"
]
=
"convert_double4"
;
compilationDefines
[
"convert_mixed4"
]
=
"convert_double4"
;
}
else
if
(
useMixedPrecision
)
{
posq
=
OpenCLArray
::
create
<
mm_float4
>
(
*
this
,
paddedNumAtoms
,
"posq"
);
posqCorrection
=
OpenCLArray
::
create
<
mm_float4
>
(
*
this
,
paddedNumAtoms
,
"posq"
);
velm
=
OpenCLArray
::
create
<
mm_double4
>
(
*
this
,
paddedNumAtoms
,
"velm"
);
compilationDefines
[
"USE_MIXED_PRECISION"
]
=
"1"
;
compilationDefines
[
"convert_real4"
]
=
"convert_float4"
;
compilationDefines
[
"convert_mixed4"
]
=
"convert_double4"
;
}
else
{
posq
=
OpenCLArray
::
create
<
mm_float4
>
(
*
this
,
paddedNumAtoms
,
"posq"
);
velm
=
OpenCLArray
::
create
<
mm_float4
>
(
*
this
,
paddedNumAtoms
,
"velm"
);
compilationDefines
[
"convert_real4"
]
=
"convert_float4"
;
compilationDefines
[
"convert_mixed4"
]
=
"convert_float4"
;
}
posCellOffsets
.
resize
(
paddedNumAtoms
,
mm_int4
(
0
,
0
,
0
,
0
));
}
catch
(
cl
::
Error
err
)
{
...
...
@@ -241,34 +274,43 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
// Decide whether native_sqrt(), native_rsqrt(), and native_recip() are sufficiently accurate to use.
cl
::
Kernel
accuracyKernel
(
utilities
,
"determineNativeAccuracy"
);
OpenCLArray
valuesArray
(
*
this
,
20
,
sizeof
(
mm_float8
),
"values"
);
vector
<
mm_float8
>
values
(
valuesArray
.
getSize
());
float
nextValue
=
1e-4
f
;
for
(
int
i
=
0
;
i
<
(
int
)
values
.
size
();
++
i
)
{
values
[
i
].
s0
=
nextValue
;
nextValue
*=
(
float
)
M_PI
;
if
(
!
useDoublePrecision
)
{
cl
::
Kernel
accuracyKernel
(
utilities
,
"determineNativeAccuracy"
);
OpenCLArray
valuesArray
(
*
this
,
20
,
sizeof
(
mm_float8
),
"values"
);
vector
<
mm_float8
>
values
(
valuesArray
.
getSize
());
float
nextValue
=
1e-4
f
;
for
(
int
i
=
0
;
i
<
(
int
)
values
.
size
();
++
i
)
{
values
[
i
].
s0
=
nextValue
;
nextValue
*=
(
float
)
M_PI
;
}
valuesArray
.
upload
(
values
);
accuracyKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
valuesArray
.
getDeviceBuffer
());
accuracyKernel
.
setArg
<
cl_int
>
(
1
,
values
.
size
());
executeKernel
(
accuracyKernel
,
values
.
size
());
valuesArray
.
download
(
values
);
double
maxSqrtError
=
0.0
,
maxRsqrtError
=
0.0
,
maxRecipError
=
0.0
,
maxExpError
=
0.0
,
maxLogError
=
0.0
;
for
(
int
i
=
0
;
i
<
(
int
)
values
.
size
();
++
i
)
{
double
v
=
values
[
i
].
s0
;
double
correctSqrt
=
sqrt
(
v
);
maxSqrtError
=
max
(
maxSqrtError
,
fabs
(
correctSqrt
-
values
[
i
].
s1
)
/
correctSqrt
);
maxRsqrtError
=
max
(
maxRsqrtError
,
fabs
(
1.0
/
correctSqrt
-
values
[
i
].
s2
)
*
correctSqrt
);
maxRecipError
=
max
(
maxRecipError
,
fabs
(
1.0
/
v
-
values
[
i
].
s3
)
/
values
[
i
].
s3
);
maxExpError
=
max
(
maxExpError
,
fabs
(
exp
(
v
)
-
values
[
i
].
s4
)
/
values
[
i
].
s4
);
maxLogError
=
max
(
maxLogError
,
fabs
(
log
(
v
)
-
values
[
i
].
s5
)
/
values
[
i
].
s5
);
}
compilationDefines
[
"SQRT"
]
=
(
maxSqrtError
<
1e-6
)
?
"native_sqrt"
:
"sqrt"
;
compilationDefines
[
"RSQRT"
]
=
(
maxRsqrtError
<
1e-6
)
?
"native_rsqrt"
:
"rsqrt"
;
compilationDefines
[
"RECIP"
]
=
(
maxRecipError
<
1e-6
)
?
"native_recip"
:
"1.0f/"
;
compilationDefines
[
"EXP"
]
=
(
maxExpError
<
1e-6
)
?
"native_exp"
:
"exp"
;
compilationDefines
[
"LOG"
]
=
(
maxLogError
<
1e-6
)
?
"native_log"
:
"log"
;
}
valuesArray
.
upload
(
values
);
accuracyKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
valuesArray
.
getDeviceBuffer
());
accuracyKernel
.
setArg
<
cl_int
>
(
1
,
values
.
size
());
executeKernel
(
accuracyKernel
,
values
.
size
());
valuesArray
.
download
(
values
);
double
maxSqrtError
=
0.0
,
maxRsqrtError
=
0.0
,
maxRecipError
=
0.0
,
maxExpError
=
0.0
,
maxLogError
=
0.0
;
for
(
int
i
=
0
;
i
<
(
int
)
values
.
size
();
++
i
)
{
double
v
=
values
[
i
].
s0
;
double
correctSqrt
=
sqrt
(
v
);
maxSqrtError
=
max
(
maxSqrtError
,
fabs
(
correctSqrt
-
values
[
i
].
s1
)
/
correctSqrt
);
maxRsqrtError
=
max
(
maxRsqrtError
,
fabs
(
1.0
/
correctSqrt
-
values
[
i
].
s2
)
*
correctSqrt
);
maxRecipError
=
max
(
maxRecipError
,
fabs
(
1.0
/
v
-
values
[
i
].
s3
)
/
values
[
i
].
s3
);
maxExpError
=
max
(
maxExpError
,
fabs
(
exp
(
v
)
-
values
[
i
].
s4
)
/
values
[
i
].
s4
);
maxLogError
=
max
(
maxLogError
,
fabs
(
log
(
v
)
-
values
[
i
].
s5
)
/
values
[
i
].
s5
);
else
{
compilationDefines
[
"SQRT"
]
=
"sqrt"
;
compilationDefines
[
"RSQRT"
]
=
"rsqrt"
;
compilationDefines
[
"RECIP"
]
=
"1.0/"
;
compilationDefines
[
"EXP"
]
=
"exp"
;
compilationDefines
[
"LOG"
]
=
"log"
;
}
compilationDefines
[
"SQRT"
]
=
(
maxSqrtError
<
1e-6
)
?
"native_sqrt"
:
"sqrt"
;
compilationDefines
[
"RSQRT"
]
=
(
maxRsqrtError
<
1e-6
)
?
"native_rsqrt"
:
"rsqrt"
;
compilationDefines
[
"RECIP"
]
=
(
maxRecipError
<
1e-6
)
?
"native_recip"
:
"1.0f/"
;
compilationDefines
[
"EXP"
]
=
(
maxExpError
<
1e-6
)
?
"native_exp"
:
"exp"
;
compilationDefines
[
"LOG"
]
=
(
maxLogError
<
1e-6
)
?
"native_log"
:
"log"
;
// Create the work thread used for parallelization when running on multiple devices.
...
...
@@ -311,18 +353,21 @@ OpenCLContext::~OpenCLContext() {
}
void
OpenCLContext
::
initialize
()
{
vector
<
mm_float4
>
v
(
paddedNumAtoms
,
mm_float4
(
0
,
0
,
0
,
0
));
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
{
double
mass
=
system
.
getParticleMass
(
i
);
v
[
i
].
w
=
(
float
)
(
mass
==
0.0
?
0.0
:
1.0
/
mass
);
}
velm
->
upload
(
v
);
bonded
->
initialize
(
system
);
numForceBuffers
=
platformData
.
contexts
.
size
();
numForceBuffers
=
std
::
max
(
numForceBuffers
,
bonded
->
getNumForceBuffers
());
for
(
int
i
=
0
;
i
<
(
int
)
forces
.
size
();
i
++
)
numForceBuffers
=
std
::
max
(
numForceBuffers
,
forces
[
i
]
->
getRequiredForceBuffers
());
forceBuffers
=
OpenCLArray
::
create
<
mm_float4
>
(
*
this
,
paddedNumAtoms
*
numForceBuffers
,
"forceBuffers"
);
if
(
useDoublePrecision
)
{
forceBuffers
=
OpenCLArray
::
create
<
mm_double4
>
(
*
this
,
paddedNumAtoms
*
numForceBuffers
,
"forceBuffers"
);
force
=
OpenCLArray
::
create
<
mm_double4
>
(
*
this
,
&
forceBuffers
->
getDeviceBuffer
(),
paddedNumAtoms
,
"force"
);
energyBuffer
=
OpenCLArray
::
create
<
cl_double
>
(
*
this
,
max
(
numThreadBlocks
*
ThreadBlockSize
,
nonbonded
->
getNumEnergyBuffers
()),
"energyBuffer"
);
}
else
{
forceBuffers
=
OpenCLArray
::
create
<
mm_float4
>
(
*
this
,
paddedNumAtoms
*
numForceBuffers
,
"forceBuffers"
);
force
=
OpenCLArray
::
create
<
mm_float4
>
(
*
this
,
&
forceBuffers
->
getDeviceBuffer
(),
paddedNumAtoms
,
"force"
);
energyBuffer
=
OpenCLArray
::
create
<
cl_float
>
(
*
this
,
max
(
numThreadBlocks
*
ThreadBlockSize
,
nonbonded
->
getNumEnergyBuffers
()),
"energyBuffer"
);
}
if
(
supports64BitGlobalAtomics
)
{
longForceBuffer
=
OpenCLArray
::
create
<
cl_long
>
(
*
this
,
3
*
paddedNumAtoms
,
"longForceBuffer"
);
reduceForcesKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
longForceBuffer
->
getDeviceBuffer
());
...
...
@@ -332,12 +377,18 @@ void OpenCLContext::initialize() {
addAutoclearBuffer
(
longForceBuffer
->
getDeviceBuffer
(),
longForceBuffer
->
getSize
()
*
2
);
}
addAutoclearBuffer
(
forceBuffers
->
getDeviceBuffer
(),
forceBuffers
->
getSize
()
*
4
);
force
=
OpenCLArray
::
create
<
mm_float4
>
(
*
this
,
&
forceBuffers
->
getDeviceBuffer
(),
paddedNumAtoms
,
"force"
);
energyBuffer
=
OpenCLArray
::
create
<
cl_float
>
(
*
this
,
max
(
numThreadBlocks
*
ThreadBlockSize
,
nonbonded
->
getNumEnergyBuffers
()),
"energyBuffer"
);
addAutoclearBuffer
(
energyBuffer
->
getDeviceBuffer
(),
energyBuffer
->
getSize
());
int
bufferBytes
=
max
(
posq
->
getSize
()
*
sizeof
(
mm_float4
),
energyBuffer
->
getSize
()
*
sizeof
(
cl_float
));
int
bufferBytes
=
max
(
posq
->
getSize
()
*
posq
->
getElementSize
(
),
energyBuffer
->
getSize
()
*
energyBuffer
->
getElementSize
(
));
pinnedBuffer
=
new
cl
::
Buffer
(
context
,
CL_MEM_ALLOC_HOST_PTR
,
bufferBytes
);
pinnedMemory
=
queue
.
enqueueMapBuffer
(
*
pinnedBuffer
,
CL_TRUE
,
CL_MAP_READ
|
CL_MAP_WRITE
,
0
,
bufferBytes
);
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
{
double
mass
=
system
.
getParticleMass
(
i
);
if
(
useDoublePrecision
||
useMixedPrecision
)
((
mm_double4
*
)
pinnedMemory
)[
i
]
=
mm_double4
(
0.0
,
0.0
,
0.0
,
mass
==
0.0
?
0.0
:
1.0
/
mass
);
else
((
mm_float4
*
)
pinnedMemory
)[
i
]
=
mm_float4
(
0.0
f
,
0.0
f
,
0.0
f
,
mass
==
0.0
?
0.0
f
:
(
cl_float
)
(
1.0
/
mass
));
}
velm
->
upload
(
pinnedMemory
);
atomIndexDevice
=
OpenCLArray
::
create
<
cl_int
>
(
*
this
,
paddedNumAtoms
,
"atomIndexDevice"
);
atomIndex
.
resize
(
paddedNumAtoms
);
for
(
int
i
=
0
;
i
<
paddedNumAtoms
;
++
i
)
...
...
@@ -382,6 +433,28 @@ cl::Program OpenCLContext::createProgram(const string source, const map<string,
}
if
(
!
compilationDefines
.
empty
())
src
<<
endl
;
if
(
supportsDoublePrecision
)
src
<<
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable
\n
"
;
if
(
useDoublePrecision
)
{
src
<<
"typedef double real;
\n
"
;
src
<<
"typedef double2 real2;
\n
"
;
src
<<
"typedef double4 real4;
\n
"
;
}
else
{
src
<<
"typedef float real;
\n
"
;
src
<<
"typedef float2 real2;
\n
"
;
src
<<
"typedef float4 real4;
\n
"
;
}
if
(
useDoublePrecision
||
useMixedPrecision
)
{
src
<<
"typedef double mixed;
\n
"
;
src
<<
"typedef double2 mixed2;
\n
"
;
src
<<
"typedef double4 mixed4;
\n
"
;
}
else
{
src
<<
"typedef float mixed;
\n
"
;
src
<<
"typedef float2 mixed2;
\n
"
;
src
<<
"typedef float4 mixed4;
\n
"
;
}
for
(
map
<
string
,
string
>::
const_iterator
iter
=
defines
.
begin
();
iter
!=
defines
.
end
();
++
iter
)
{
src
<<
"#define "
<<
iter
->
first
;
if
(
!
iter
->
second
.
empty
())
...
...
@@ -764,27 +837,62 @@ void OpenCLContext::validateMolecules() {
// atoms to their original order, rebuild the list of identical molecules, and sort them
// again.
vector
<
mm_float4
>
oldPosq
(
paddedNumAtoms
);
vector
<
mm_float4
>
newPosq
(
paddedNumAtoms
);
vector
<
mm_float4
>
oldVelm
(
paddedNumAtoms
);
vector
<
mm_float4
>
newVelm
(
paddedNumAtoms
);
vector
<
mm_int4
>
newCellOffsets
(
numAtoms
);
posq
->
download
(
oldPosq
);
velm
->
download
(
oldVelm
);
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
{
int
index
=
atomIndex
[
i
];
newPosq
[
index
]
=
oldPosq
[
i
];
newVelm
[
index
]
=
oldVelm
[
i
];
newCellOffsets
[
index
]
=
posCellOffsets
[
i
];
if
(
useDoublePrecision
)
{
vector
<
mm_double4
>
oldPosq
(
paddedNumAtoms
);
vector
<
mm_double4
>
newPosq
(
paddedNumAtoms
);
vector
<
mm_double4
>
oldVelm
(
paddedNumAtoms
);
vector
<
mm_double4
>
newVelm
(
paddedNumAtoms
);
posq
->
download
(
oldPosq
);
velm
->
download
(
oldVelm
);
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
{
int
index
=
atomIndex
[
i
];
newPosq
[
index
]
=
oldPosq
[
i
];
newVelm
[
index
]
=
oldVelm
[
i
];
newCellOffsets
[
index
]
=
posCellOffsets
[
i
];
}
posq
->
upload
(
newPosq
);
velm
->
upload
(
newVelm
);
}
else
if
(
useMixedPrecision
)
{
vector
<
mm_float4
>
oldPosq
(
paddedNumAtoms
);
vector
<
mm_float4
>
newPosq
(
paddedNumAtoms
);
vector
<
mm_float4
>
oldPosqCorrection
(
paddedNumAtoms
);
vector
<
mm_float4
>
newPosqCorrection
(
paddedNumAtoms
);
vector
<
mm_double4
>
oldVelm
(
paddedNumAtoms
);
vector
<
mm_double4
>
newVelm
(
paddedNumAtoms
);
posq
->
download
(
oldPosq
);
velm
->
download
(
oldVelm
);
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
{
int
index
=
atomIndex
[
i
];
newPosq
[
index
]
=
oldPosq
[
i
];
newPosqCorrection
[
index
]
=
oldPosqCorrection
[
i
];
newVelm
[
index
]
=
oldVelm
[
i
];
newCellOffsets
[
index
]
=
posCellOffsets
[
i
];
}
posq
->
upload
(
newPosq
);
velm
->
upload
(
newVelm
);
}
else
{
vector
<
mm_float4
>
oldPosq
(
paddedNumAtoms
);
vector
<
mm_float4
>
newPosq
(
paddedNumAtoms
);
vector
<
mm_float4
>
oldVelm
(
paddedNumAtoms
);
vector
<
mm_float4
>
newVelm
(
paddedNumAtoms
);
posq
->
download
(
oldPosq
);
velm
->
download
(
oldVelm
);
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
{
int
index
=
atomIndex
[
i
];
newPosq
[
index
]
=
oldPosq
[
i
];
newVelm
[
index
]
=
oldVelm
[
i
];
newCellOffsets
[
index
]
=
posCellOffsets
[
i
];
}
posq
->
upload
(
newPosq
);
velm
->
upload
(
newVelm
);
}
posq
->
upload
(
newPosq
);
velm
->
upload
(
newVelm
);
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
{
atomIndex
[
i
]
=
i
;
posCellOffsets
[
i
]
=
newCellOffsets
[
i
];
}
posq
->
upload
(
newPosq
);
velm
->
upload
(
newVelm
);
atomIndexDevice
->
upload
(
atomIndex
);
findMoleculeGroups
();
for
(
int
i
=
0
;
i
<
(
int
)
reorderListeners
.
size
();
i
++
)
...
...
@@ -797,16 +905,29 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
if
(
moleculesInvalid
)
validateMolecules
();
atomsWereReordered
=
true
;
if
(
useDoublePrecision
)
reorderAtomsImpl
<
cl_double
,
mm_double4
,
cl_double
,
mm_double4
>
(
enforcePeriodic
);
else
if
(
useMixedPrecision
)
reorderAtomsImpl
<
cl_float
,
mm_float4
,
cl_double
,
mm_double4
>
(
enforcePeriodic
);
else
reorderAtomsImpl
<
cl_float
,
mm_float4
,
cl_float
,
mm_float4
>
(
enforcePeriodic
);
}
template
<
class
Real
,
class
Real4
,
class
Mixed
,
class
Mixed4
>
void
OpenCLContext
::
reorderAtomsImpl
(
bool
enforcePeriodic
)
{
// Find the range of positions and the number of bins along each axis.
vector
<
mm_float4
>
oldPosq
(
paddedNumAtoms
);
vector
<
mm_float4
>
oldVelm
(
paddedNumAtoms
);
vector
<
Real4
>
oldPosq
(
paddedNumAtoms
);
vector
<
Real4
>
oldPosqCorrection
(
paddedNumAtoms
);
vector
<
Mixed4
>
oldVelm
(
paddedNumAtoms
);
posq
->
download
(
oldPosq
);
velm
->
download
(
oldVelm
);
float
minx
=
oldPosq
[
0
].
x
,
maxx
=
oldPosq
[
0
].
x
;
float
miny
=
oldPosq
[
0
].
y
,
maxy
=
oldPosq
[
0
].
y
;
float
minz
=
oldPosq
[
0
].
z
,
maxz
=
oldPosq
[
0
].
z
;
if
(
useMixedPrecision
)
posqCorrection
->
download
(
oldPosqCorrection
);
Real
minx
=
oldPosq
[
0
].
x
,
maxx
=
oldPosq
[
0
].
x
;
Real
miny
=
oldPosq
[
0
].
y
,
maxy
=
oldPosq
[
0
].
y
;
Real
minz
=
oldPosq
[
0
].
z
,
maxz
=
oldPosq
[
0
].
z
;
if
(
nonbonded
->
getUsePeriodic
())
{
minx
=
miny
=
minz
=
0.0
;
maxx
=
periodicBoxSize
.
x
;
...
...
@@ -815,7 +936,7 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
}
else
{
for
(
int
i
=
1
;
i
<
numAtoms
;
i
++
)
{
const
mm_float
4
&
pos
=
oldPosq
[
i
];
const
Real
4
&
pos
=
oldPosq
[
i
];
minx
=
min
(
minx
,
pos
.
x
);
maxx
=
max
(
maxx
,
pos
.
x
);
miny
=
min
(
miny
,
pos
.
y
);
...
...
@@ -828,8 +949,9 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
// Loop over each group of identical molecules and reorder them.
vector
<
int
>
originalIndex
(
numAtoms
);
vector
<
mm_float4
>
newPosq
(
paddedNumAtoms
);
vector
<
mm_float4
>
newVelm
(
paddedNumAtoms
);
vector
<
Real4
>
newPosq
(
paddedNumAtoms
);
vector
<
Real4
>
newPosqCorrection
(
paddedNumAtoms
);
vector
<
Mixed4
>
newVelm
(
paddedNumAtoms
);
vector
<
mm_int4
>
newCellOffsets
(
numAtoms
);
for
(
int
group
=
0
;
group
<
(
int
)
moleculeGroups
.
size
();
group
++
)
{
// Find the center of each molecule.
...
...
@@ -837,15 +959,15 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
MoleculeGroup
&
mol
=
moleculeGroups
[
group
];
int
numMolecules
=
mol
.
offsets
.
size
();
vector
<
int
>&
atoms
=
mol
.
atoms
;
vector
<
mm_float
4
>
molPos
(
numMolecules
);
float
invNumAtoms
=
1.0
f
/
atoms
.
size
();
vector
<
Real
4
>
molPos
(
numMolecules
);
Real
invNumAtoms
=
(
Real
)
(
1.0
/
atoms
.
size
()
)
;
for
(
int
i
=
0
;
i
<
numMolecules
;
i
++
)
{
molPos
[
i
].
x
=
0.0
f
;
molPos
[
i
].
y
=
0.0
f
;
molPos
[
i
].
z
=
0.0
f
;
for
(
int
j
=
0
;
j
<
(
int
)
atoms
.
size
();
j
++
)
{
int
atom
=
atoms
[
j
]
+
mol
.
offsets
[
i
];
const
mm_float
4
&
pos
=
oldPosq
[
atom
];
const
Real
4
&
pos
=
oldPosq
[
atom
];
molPos
[
i
].
x
+=
pos
.
x
;
molPos
[
i
].
y
+=
pos
.
y
;
molPos
[
i
].
z
+=
pos
.
z
;
...
...
@@ -861,9 +983,9 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
int
xcell
=
(
int
)
floor
(
molPos
[
i
].
x
*
invPeriodicBoxSize
.
x
);
int
ycell
=
(
int
)
floor
(
molPos
[
i
].
y
*
invPeriodicBoxSize
.
y
);
int
zcell
=
(
int
)
floor
(
molPos
[
i
].
z
*
invPeriodicBoxSize
.
z
);
float
dx
=
xcell
*
periodicBoxSize
.
x
;
float
dy
=
ycell
*
periodicBoxSize
.
y
;
float
dz
=
zcell
*
periodicBoxSize
.
z
;
Real
dx
=
xcell
*
periodicBoxSize
.
x
;
Real
dy
=
ycell
*
periodicBoxSize
.
y
;
Real
dz
=
zcell
*
periodicBoxSize
.
z
;
if
(
dx
!=
0.0
f
||
dy
!=
0.0
f
||
dz
!=
0.0
f
)
{
molPos
[
i
].
x
-=
dx
;
molPos
[
i
].
y
-=
dy
;
...
...
@@ -871,7 +993,7 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
if
(
enforcePeriodic
)
{
for
(
int
j
=
0
;
j
<
(
int
)
atoms
.
size
();
j
++
)
{
int
atom
=
atoms
[
j
]
+
mol
.
offsets
[
i
];
mm_float
4
p
=
oldPosq
[
atom
];
Real
4
p
=
oldPosq
[
atom
];
p
.
x
-=
dx
;
p
.
y
-=
dy
;
p
.
z
-=
dz
;
...
...
@@ -888,12 +1010,12 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
// Select a bin for each molecule, then sort them by bin.
bool
useHilbert
=
(
numMolecules
>
5000
||
atoms
.
size
()
>
8
);
// For small systems, a simple zigzag curve works better than a Hilbert curve.
float
binWidth
;
Real
binWidth
;
if
(
useHilbert
)
binWidth
=
(
float
)
(
max
(
max
(
maxx
-
minx
,
maxy
-
miny
),
maxz
-
minz
)
/
255.0
);
binWidth
=
(
Real
)
(
max
(
max
(
maxx
-
minx
,
maxy
-
miny
),
maxz
-
minz
)
/
255.0
);
else
binWidth
=
(
float
)
(
0.2
*
nonbonded
->
getCutoffDistance
());
float
invBinWidth
=
1.0
f
/
binWidth
;
binWidth
=
(
Real
)
(
0.2
*
nonbonded
->
getCutoffDistance
());
Real
invBinWidth
=
(
Real
)
(
1.0
/
binWidth
)
;
int
xbins
=
1
+
(
int
)
((
maxx
-
minx
)
*
invBinWidth
);
int
ybins
=
1
+
(
int
)
((
maxy
-
miny
)
*
invBinWidth
);
vector
<
pair
<
int
,
int
>
>
molBins
(
numMolecules
);
...
...
@@ -928,6 +1050,8 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
int
newIndex
=
mol
.
offsets
[
i
]
+
atoms
[
j
];
originalIndex
[
newIndex
]
=
atomIndex
[
oldIndex
];
newPosq
[
newIndex
]
=
oldPosq
[
oldIndex
];
if
(
useMixedPrecision
)
newPosqCorrection
[
newIndex
]
=
oldPosqCorrection
[
oldIndex
];
newVelm
[
newIndex
]
=
oldVelm
[
oldIndex
];
newCellOffsets
[
newIndex
]
=
posCellOffsets
[
oldIndex
];
}
...
...
@@ -941,6 +1065,8 @@ void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
posCellOffsets
[
i
]
=
newCellOffsets
[
i
];
}
posq
->
upload
(
newPosq
);
if
(
useMixedPrecision
)
posqCorrection
->
upload
(
newPosqCorrection
);
velm
->
upload
(
newVelm
);
atomIndexDevice
->
upload
(
atomIndex
);
for
(
int
i
=
0
;
i
<
(
int
)
reorderListeners
.
size
();
i
++
)
...
...
platforms/opencl/src/OpenCLContext.h
View file @
8d6a2a01
...
...
@@ -62,7 +62,7 @@ struct mm_float2 {
mm_float2
(
cl_float
x
,
cl_float
y
)
:
x
(
x
),
y
(
y
)
{
}
};
struct
mm_float4
{
struct
mm_float4
{
cl_float
x
,
y
,
z
,
w
;
mm_float4
()
{
}
...
...
@@ -87,6 +87,20 @@ struct mm_float16 {
s8
(
s8
),
s9
(
s9
),
s10
(
s10
),
s11
(
s11
),
s12
(
s12
),
s13
(
s13
),
s14
(
s14
),
s15
(
15
)
{
}
};
struct
mm_double2
{
cl_double
x
,
y
;
mm_double2
()
{
}
mm_double2
(
cl_double
x
,
cl_double
y
)
:
x
(
x
),
y
(
y
)
{
}
};
struct
mm_double4
{
cl_double
x
,
y
,
z
,
w
;
mm_double4
()
{
}
mm_double4
(
cl_double
x
,
cl_double
y
,
cl_double
z
,
cl_double
w
)
:
x
(
x
),
y
(
y
),
z
(
z
),
w
(
w
)
{
}
};
struct
mm_ushort2
{
cl_ushort
x
,
y
;
mm_ushort2
()
{
...
...
@@ -145,7 +159,7 @@ public:
class
ReorderListener
;
static
const
int
ThreadBlockSize
;
static
const
int
TileSize
;
OpenCLContext
(
const
System
&
system
,
int
platformIndex
,
int
deviceIndex
,
OpenCLPlatform
::
PlatformData
&
platformData
);
OpenCLContext
(
const
System
&
system
,
int
platformIndex
,
int
deviceIndex
,
const
std
::
string
&
precision
,
OpenCLPlatform
::
PlatformData
&
platformData
);
~
OpenCLContext
();
/**
* This is called to initialize internal data structures after all Forces in the system
...
...
@@ -198,6 +212,12 @@ public:
OpenCLArray
&
getPosq
()
{
return
*
posq
;
}
/**
* Get the array which contains a correction to the position of each atom. This only exists if getUseMixedPrecision() returns true.
*/
OpenCLArray
&
getPosqCorrection
()
{
return
*
posqCorrection
;
}
/**
* Get the array which contains the velocity (the xyz components) and inverse mass (the w component) of each atom.
*/
...
...
@@ -405,18 +425,38 @@ public:
bool
getSupportsDoublePrecision
()
{
return
supportsDoublePrecision
;
}
/**
* Get whether double precision is being used.
*/
bool
getUseDoublePrecision
()
{
return
useDoublePrecision
;
}
/**
* Get whether mixed precision is being used.
*/
bool
getUseMixedPrecision
()
{
return
useMixedPrecision
;
}
/**
* Get the size of the periodic box.
*/
mm_float4
getPeriodicBoxSize
()
const
{
return
periodicBoxSize
;
}
/**
* Get the size of the periodic box.
*/
mm_double4
getPeriodicBoxSizeDouble
()
const
{
return
periodicBoxSizeDouble
;
}
/**
* Set the size of the periodic box.
*/
void
setPeriodicBoxSize
(
double
xsize
,
double
ysize
,
double
zsize
)
{
periodicBoxSize
=
mm_float4
((
float
)
xsize
,
(
float
)
ysize
,
(
float
)
zsize
,
0
);
invPeriodicBoxSize
=
mm_float4
((
float
)
(
1.0
/
xsize
),
(
float
)
(
1.0
/
ysize
),
(
float
)
(
1.0
/
zsize
),
0
);
periodicBoxSizeDouble
=
mm_double4
(
xsize
,
ysize
,
zsize
,
0
);
invPeriodicBoxSizeDouble
=
mm_double4
(
1.0
/
xsize
,
1.0
/
ysize
,
1.0
/
zsize
,
0
);
}
/**
* Get the inverse of the size of the periodic box.
...
...
@@ -424,6 +464,12 @@ public:
mm_float4
getInvPeriodicBoxSize
()
const
{
return
invPeriodicBoxSize
;
}
/**
* Get the inverse of the size of the periodic box.
*/
mm_double4
getInvPeriodicBoxSizeDouble
()
const
{
return
invPeriodicBoxSizeDouble
;
}
/**
* Get the OpenCLIntegrationUtilities for this context.
*/
...
...
@@ -502,6 +548,11 @@ private:
* of molecules and resort the atoms.
*/
void
validateMolecules
();
/**
* This is the internal implementation of reorderAtoms(), templatized by the numerical precision in use.
*/
template
<
class
Real
,
class
Real4
,
class
Mixed
,
class
Mixed4
>
void
reorderAtomsImpl
(
bool
enforcePeriodic
);
const
System
&
system
;
double
time
;
OpenCLPlatform
::
PlatformData
&
platformData
;
...
...
@@ -515,9 +566,9 @@ private:
int
numThreadBlocks
;
int
numForceBuffers
;
int
simdWidth
;
bool
supports64BitGlobalAtomics
,
supportsDoublePrecision
,
atomsWereReordered
,
moleculesInvalid
;
mm_float4
periodicBoxSize
;
mm_
float4
invPeriodicBoxSize
;
bool
supports64BitGlobalAtomics
,
supportsDoublePrecision
,
useDoublePrecision
,
useMixedPrecision
,
atomsWereReordered
,
moleculesInvalid
;
mm_float4
periodicBoxSize
,
invPeriodicBoxSize
;
mm_
double4
periodicBoxSizeDouble
,
invPeriodicBoxSize
Double
;
std
::
string
defaultOptimizationOptions
;
std
::
map
<
std
::
string
,
std
::
string
>
compilationDefines
;
cl
::
Context
context
;
...
...
@@ -538,6 +589,7 @@ private:
cl
::
Buffer
*
pinnedBuffer
;
void
*
pinnedMemory
;
OpenCLArray
*
posq
;
OpenCLArray
*
posqCorrection
;
OpenCLArray
*
velm
;
OpenCLArray
*
force
;
OpenCLArray
*
forceBuffers
;
...
...
platforms/opencl/src/OpenCLIntegrationUtilities.cpp
View file @
8d6a2a01
...
...
@@ -87,6 +87,13 @@ struct OpenCLIntegrationUtilities::ConstraintOrderer : public binary_function<in
}
};
static
void
setPosqCorrectionArg
(
OpenCLContext
&
cl
,
cl
::
Kernel
&
kernel
,
int
index
)
{
if
(
cl
.
getUseMixedPrecision
())
kernel
.
setArg
<
cl
::
Buffer
>
(
index
,
cl
.
getPosqCorrection
().
getDeviceBuffer
());
else
kernel
.
setArg
<
void
*>
(
index
,
NULL
);
}
OpenCLIntegrationUtilities
::
OpenCLIntegrationUtilities
(
OpenCLContext
&
context
,
const
System
&
system
)
:
context
(
context
),
posDelta
(
NULL
),
settleAtoms
(
NULL
),
settleParams
(
NULL
),
shakeAtoms
(
NULL
),
shakeParams
(
NULL
),
random
(
NULL
),
randomSeed
(
NULL
),
randomPos
(
0
),
stepSize
(
NULL
),
ccmaAtoms
(
NULL
),
ccmaDistance
(
NULL
),
...
...
@@ -96,12 +103,22 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
vsiteOutOfPlaneAtoms
(
NULL
),
vsiteOutOfPlaneWeights
(
NULL
),
hasInitializedPosConstraintKernels
(
false
),
hasInitializedVelConstraintKernels
(
false
)
{
// Create workspace arrays.
posDelta
=
OpenCLArray
::
create
<
mm_float4
>
(
context
,
context
.
getPaddedNumAtoms
(),
"posDelta"
);
vector
<
mm_float4
>
deltas
(
posDelta
->
getSize
(),
mm_float4
(
0.0
,
0.0
,
0.0
,
0.0
));
posDelta
->
upload
(
deltas
);
stepSize
=
OpenCLArray
::
create
<
mm_float2
>
(
context
,
1
,
"stepSize"
);
vector
<
mm_float2
>
step
(
1
,
mm_float2
(
0.0
f
,
0.0
f
));
stepSize
->
upload
(
step
);
if
(
context
.
getUseDoublePrecision
()
||
context
.
getUseMixedPrecision
())
{
posDelta
=
OpenCLArray
::
create
<
mm_double4
>
(
context
,
context
.
getPaddedNumAtoms
(),
"posDelta"
);
vector
<
mm_double4
>
deltas
(
posDelta
->
getSize
(),
mm_double4
(
0.0
,
0.0
,
0.0
,
0.0
));
posDelta
->
upload
(
deltas
);
stepSize
=
OpenCLArray
::
create
<
mm_double2
>
(
context
,
1
,
"stepSize"
);
vector
<
mm_double2
>
step
(
1
,
mm_double2
(
0.0
,
0.0
));
stepSize
->
upload
(
step
);
}
else
{
posDelta
=
OpenCLArray
::
create
<
mm_float4
>
(
context
,
context
.
getPaddedNumAtoms
(),
"posDelta"
);
vector
<
mm_float4
>
deltas
(
posDelta
->
getSize
(),
mm_float4
(
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
));
posDelta
->
upload
(
deltas
);
stepSize
=
OpenCLArray
::
create
<
mm_float2
>
(
context
,
1
,
"stepSize"
);
vector
<
mm_float2
>
step
(
1
,
mm_float2
(
0.0
f
,
0.0
f
));
stepSize
->
upload
(
step
);
}
// Create kernels for enforcing constraints.
...
...
@@ -458,51 +475,86 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
// Record the CCMA data structures.
ccmaAtoms
=
OpenCLArray
::
create
<
mm_int2
>
(
context
,
numCCMA
,
"CcmaAtoms"
);
ccmaDistance
=
OpenCLArray
::
create
<
mm_float4
>
(
context
,
numCCMA
,
"CcmaDistance"
);
ccmaAtomConstraints
=
OpenCLArray
::
create
<
cl_int
>
(
context
,
numAtoms
*
maxAtomConstraints
,
"CcmaAtomConstraints"
);
ccmaNumAtomConstraints
=
OpenCLArray
::
create
<
cl_int
>
(
context
,
numAtoms
,
"CcmaAtomConstraintsIndex"
);
ccmaDelta1
=
OpenCLArray
::
create
<
cl_float
>
(
context
,
numCCMA
,
"CcmaDelta1"
);
ccmaDelta2
=
OpenCLArray
::
create
<
cl_float
>
(
context
,
numCCMA
,
"CcmaDelta2"
);
ccmaConstraintMatrixColumn
=
OpenCLArray
::
create
<
cl_int
>
(
context
,
numCCMA
*
maxRowElements
,
"ConstraintMatrixColumn"
);
ccmaConverged
=
OpenCLArray
::
create
<
cl_int
>
(
context
,
2
,
"CcmaConverged"
);
ccmaConvergedBuffer
=
new
cl
::
Buffer
(
context
.
getContext
(),
CL_MEM_ALLOC_HOST_PTR
,
2
*
sizeof
(
cl_int
));
ccmaConvergedMemory
=
(
cl_int
*
)
context
.
getQueue
().
enqueueMapBuffer
(
*
ccmaConvergedBuffer
,
CL_TRUE
,
CL_MAP_READ
|
CL_MAP_WRITE
,
0
,
2
*
sizeof
(
cl_int
));
ccmaReducedMass
=
OpenCLArray
::
create
<
cl_float
>
(
context
,
numCCMA
,
"CcmaReducedMass"
);
ccmaConstraintMatrixColumn
=
OpenCLArray
::
create
<
cl_int
>
(
context
,
numCCMA
*
maxRowElements
,
"ConstraintMatrixColumn"
);
ccmaConstraintMatrixValue
=
OpenCLArray
::
create
<
cl_float
>
(
context
,
numCCMA
*
maxRowElements
,
"ConstraintMatrixValue"
);
vector
<
mm_int2
>
atomsVec
(
ccmaAtoms
->
getSize
());
vector
<
mm_float4
>
distanceVec
(
ccmaDistance
->
getSize
());
vector
<
cl_int
>
atomConstraintsVec
(
ccmaAtomConstraints
->
getSize
());
vector
<
cl_int
>
numAtomConstraintsVec
(
ccmaNumAtomConstraints
->
getSize
());
vector
<
cl_float
>
reducedMassVec
(
ccmaReducedMass
->
getSize
());
vector
<
cl_int
>
constraintMatrixColumnVec
(
ccmaConstraintMatrixColumn
->
getSize
());
vector
<
cl_float
>
constraintMatrixValueVec
(
ccmaConstraintMatrixValue
->
getSize
());
for
(
int
i
=
0
;
i
<
numCCMA
;
i
++
)
{
int
index
=
constraintOrder
[
i
];
int
c
=
ccmaConstraints
[
index
];
atomsVec
[
i
].
x
=
atom1
[
c
];
atomsVec
[
i
].
y
=
atom2
[
c
];
distanceVec
[
i
].
w
=
(
float
)
distance
[
c
];
reducedMassVec
[
i
]
=
(
float
)
(
0.5
/
(
1.0
/
system
.
getParticleMass
(
atom1
[
c
])
+
1.0
/
system
.
getParticleMass
(
atom2
[
c
])));
for
(
unsigned
int
j
=
0
;
j
<
matrix
[
index
].
size
();
j
++
)
{
constraintMatrixColumnVec
[
i
+
j
*
numCCMA
]
=
matrix
[
index
][
j
].
first
;
constraintMatrixValueVec
[
i
+
j
*
numCCMA
]
=
(
float
)
matrix
[
index
][
j
].
second
;
if
(
context
.
getUseDoublePrecision
()
||
context
.
getUseMixedPrecision
())
{
ccmaDistance
=
OpenCLArray
::
create
<
mm_double4
>
(
context
,
numCCMA
,
"CcmaDistance"
);
ccmaDelta1
=
OpenCLArray
::
create
<
cl_double
>
(
context
,
numCCMA
,
"CcmaDelta1"
);
ccmaDelta2
=
OpenCLArray
::
create
<
cl_double
>
(
context
,
numCCMA
,
"CcmaDelta2"
);
ccmaReducedMass
=
OpenCLArray
::
create
<
cl_double
>
(
context
,
numCCMA
,
"CcmaReducedMass"
);
ccmaConstraintMatrixValue
=
OpenCLArray
::
create
<
cl_double
>
(
context
,
numCCMA
*
maxRowElements
,
"ConstraintMatrixValue"
);
vector
<
mm_double4
>
distanceVec
(
ccmaDistance
->
getSize
());
vector
<
cl_double
>
reducedMassVec
(
ccmaReducedMass
->
getSize
());
vector
<
cl_double
>
constraintMatrixValueVec
(
ccmaConstraintMatrixValue
->
getSize
());
for
(
int
i
=
0
;
i
<
numCCMA
;
i
++
)
{
int
index
=
constraintOrder
[
i
];
int
c
=
ccmaConstraints
[
index
];
atomsVec
[
i
].
x
=
atom1
[
c
];
atomsVec
[
i
].
y
=
atom2
[
c
];
distanceVec
[
i
].
w
=
distance
[
c
];
reducedMassVec
[
i
]
=
(
0.5
/
(
1.0
/
system
.
getParticleMass
(
atom1
[
c
])
+
1.0
/
system
.
getParticleMass
(
atom2
[
c
])));
for
(
unsigned
int
j
=
0
;
j
<
matrix
[
index
].
size
();
j
++
)
{
constraintMatrixColumnVec
[
i
+
j
*
numCCMA
]
=
matrix
[
index
][
j
].
first
;
constraintMatrixValueVec
[
i
+
j
*
numCCMA
]
=
matrix
[
index
][
j
].
second
;
}
constraintMatrixColumnVec
[
i
+
matrix
[
index
].
size
()
*
numCCMA
]
=
numCCMA
;
}
for
(
unsigned
int
i
=
0
;
i
<
atomConstraints
.
size
();
i
++
)
{
numAtomConstraintsVec
[
i
]
=
atomConstraints
[
i
].
size
();
for
(
unsigned
int
j
=
0
;
j
<
atomConstraints
[
i
].
size
();
j
++
)
{
bool
forward
=
(
atom1
[
ccmaConstraints
[
atomConstraints
[
i
][
j
]]]
==
i
);
atomConstraintsVec
[
i
+
j
*
numAtoms
]
=
(
forward
?
inverseOrder
[
atomConstraints
[
i
][
j
]]
+
1
:
-
inverseOrder
[
atomConstraints
[
i
][
j
]]
-
1
);
}
}
constraintMatrixColumnVec
[
i
+
matrix
[
index
].
size
()
*
numCCMA
]
=
numCCMA
;
ccmaDistance
->
upload
(
distanceVec
);
ccmaReducedMass
->
upload
(
reducedMassVec
);
ccmaConstraintMatrixValue
->
upload
(
constraintMatrixValueVec
);
}
for
(
unsigned
int
i
=
0
;
i
<
atomConstraints
.
size
();
i
++
)
{
numAtomConstraintsVec
[
i
]
=
atomConstraints
[
i
].
size
();
for
(
unsigned
int
j
=
0
;
j
<
atomConstraints
[
i
].
size
();
j
++
)
{
bool
forward
=
(
atom1
[
ccmaConstraints
[
atomConstraints
[
i
][
j
]]]
==
i
);
atomConstraintsVec
[
i
+
j
*
numAtoms
]
=
(
forward
?
inverseOrder
[
atomConstraints
[
i
][
j
]]
+
1
:
-
inverseOrder
[
atomConstraints
[
i
][
j
]]
-
1
);
else
{
ccmaDistance
=
OpenCLArray
::
create
<
mm_float4
>
(
context
,
numCCMA
,
"CcmaDistance"
);
ccmaDelta1
=
OpenCLArray
::
create
<
cl_float
>
(
context
,
numCCMA
,
"CcmaDelta1"
);
ccmaDelta2
=
OpenCLArray
::
create
<
cl_float
>
(
context
,
numCCMA
,
"CcmaDelta2"
);
ccmaReducedMass
=
OpenCLArray
::
create
<
cl_float
>
(
context
,
numCCMA
,
"CcmaReducedMass"
);
ccmaConstraintMatrixValue
=
OpenCLArray
::
create
<
cl_float
>
(
context
,
numCCMA
*
maxRowElements
,
"ConstraintMatrixValue"
);
vector
<
mm_float4
>
distanceVec
(
ccmaDistance
->
getSize
());
vector
<
cl_float
>
reducedMassVec
(
ccmaReducedMass
->
getSize
());
vector
<
cl_float
>
constraintMatrixValueVec
(
ccmaConstraintMatrixValue
->
getSize
());
for
(
int
i
=
0
;
i
<
numCCMA
;
i
++
)
{
int
index
=
constraintOrder
[
i
];
int
c
=
ccmaConstraints
[
index
];
atomsVec
[
i
].
x
=
atom1
[
c
];
atomsVec
[
i
].
y
=
atom2
[
c
];
distanceVec
[
i
].
w
=
(
float
)
distance
[
c
];
reducedMassVec
[
i
]
=
(
float
)
(
0.5
/
(
1.0
/
system
.
getParticleMass
(
atom1
[
c
])
+
1.0
/
system
.
getParticleMass
(
atom2
[
c
])));
for
(
unsigned
int
j
=
0
;
j
<
matrix
[
index
].
size
();
j
++
)
{
constraintMatrixColumnVec
[
i
+
j
*
numCCMA
]
=
matrix
[
index
][
j
].
first
;
constraintMatrixValueVec
[
i
+
j
*
numCCMA
]
=
(
float
)
matrix
[
index
][
j
].
second
;
}
constraintMatrixColumnVec
[
i
+
matrix
[
index
].
size
()
*
numCCMA
]
=
numCCMA
;
}
for
(
unsigned
int
i
=
0
;
i
<
atomConstraints
.
size
();
i
++
)
{
numAtomConstraintsVec
[
i
]
=
atomConstraints
[
i
].
size
();
for
(
unsigned
int
j
=
0
;
j
<
atomConstraints
[
i
].
size
();
j
++
)
{
bool
forward
=
(
atom1
[
ccmaConstraints
[
atomConstraints
[
i
][
j
]]]
==
i
);
atomConstraintsVec
[
i
+
j
*
numAtoms
]
=
(
forward
?
inverseOrder
[
atomConstraints
[
i
][
j
]]
+
1
:
-
inverseOrder
[
atomConstraints
[
i
][
j
]]
-
1
);
}
}
ccmaDistance
->
upload
(
distanceVec
);
ccmaReducedMass
->
upload
(
reducedMassVec
);
ccmaConstraintMatrixValue
->
upload
(
constraintMatrixValueVec
);
}
ccmaAtoms
->
upload
(
atomsVec
);
ccmaDistance
->
upload
(
distanceVec
);
ccmaAtomConstraints
->
upload
(
atomConstraintsVec
);
ccmaNumAtomConstraints
->
upload
(
numAtomConstraintsVec
);
ccmaReducedMass
->
upload
(
reducedMassVec
);
ccmaConstraintMatrixColumn
->
upload
(
constraintMatrixColumnVec
);
ccmaConstraintMatrixValue
->
upload
(
constraintMatrixValueVec
);
// Create the CCMA kernels.
...
...
@@ -584,21 +636,23 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
cl
::
Program
vsiteProgram
=
context
.
createProgram
(
OpenCLKernelSources
::
virtualSites
,
defines
);
vsitePositionKernel
=
cl
::
Kernel
(
vsiteProgram
,
"computeVirtualSites"
);
vsitePositionKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
context
.
getPosq
().
getDeviceBuffer
());
vsitePositionKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
vsite2AvgAtoms
->
getDeviceBuffer
());
vsitePositionKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
vsite2AvgWeights
->
getDeviceBuffer
());
vsitePositionKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
vsite3AvgAtoms
->
getDeviceBuffer
());
vsitePositionKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
vsite3AvgWeights
->
getDeviceBuffer
());
vsitePositionKernel
.
setArg
<
cl
::
Buffer
>
(
5
,
vsiteOutOfPlaneAtoms
->
getDeviceBuffer
());
vsitePositionKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
vsiteOutOfPlaneWeights
->
getDeviceBuffer
());
setPosqCorrectionArg
(
context
,
vsitePositionKernel
,
1
);
vsitePositionKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
vsite2AvgAtoms
->
getDeviceBuffer
());
vsitePositionKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
vsite2AvgWeights
->
getDeviceBuffer
());
vsitePositionKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
vsite3AvgAtoms
->
getDeviceBuffer
());
vsitePositionKernel
.
setArg
<
cl
::
Buffer
>
(
5
,
vsite3AvgWeights
->
getDeviceBuffer
());
vsitePositionKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
vsiteOutOfPlaneAtoms
->
getDeviceBuffer
());
vsitePositionKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
vsiteOutOfPlaneWeights
->
getDeviceBuffer
());
vsiteForceKernel
=
cl
::
Kernel
(
vsiteProgram
,
"distributeForces"
);
vsiteForceKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
context
.
getPosq
().
getDeviceBuffer
());
// Skip argument 1: the force array hasn't been created yet.
vsiteForceKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
vsite2AvgAtoms
->
getDeviceBuffer
());
vsiteForceKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
vsite2AvgWeights
->
getDeviceBuffer
());
vsiteForceKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
vsite3AvgAtoms
->
getDeviceBuffer
());
vsiteForceKernel
.
setArg
<
cl
::
Buffer
>
(
5
,
vsite3AvgWeights
->
getDeviceBuffer
());
vsiteForceKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
vsiteOutOfPlaneAtoms
->
getDeviceBuffer
());
vsiteForceKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
vsiteOutOfPlaneWeights
->
getDeviceBuffer
());
setPosqCorrectionArg
(
context
,
vsiteForceKernel
,
1
);
// Skip argument 2: the force array hasn't been created yet.
vsiteForceKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
vsite2AvgAtoms
->
getDeviceBuffer
());
vsiteForceKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
vsite2AvgWeights
->
getDeviceBuffer
());
vsiteForceKernel
.
setArg
<
cl
::
Buffer
>
(
5
,
vsite3AvgAtoms
->
getDeviceBuffer
());
vsiteForceKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
vsite3AvgWeights
->
getDeviceBuffer
());
vsiteForceKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
vsiteOutOfPlaneAtoms
->
getDeviceBuffer
());
vsiteForceKernel
.
setArg
<
cl
::
Buffer
>
(
8
,
vsiteOutOfPlaneWeights
->
getDeviceBuffer
());
numVsites
=
num2Avg
+
num3Avg
+
numOutOfPlane
;
}
...
...
@@ -686,23 +740,37 @@ void OpenCLIntegrationUtilities::applyConstraints(bool constrainVelocities, doub
if
(
!
hasInitialized
)
{
settleKernel
.
setArg
<
cl_int
>
(
0
,
settleAtoms
->
getSize
());
settleKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
context
.
getPosq
().
getDeviceBuffer
());
settleKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
posDelta
->
getDeviceBuffer
());
settleKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
context
.
getVelm
().
getDeviceBuffer
());
settleKernel
.
setArg
<
cl
::
Buffer
>
(
5
,
settleAtoms
->
getDeviceBuffer
());
settleKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
settleParams
->
getDeviceBuffer
());
if
(
context
.
getUseMixedPrecision
())
settleKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
context
.
getPosqCorrection
().
getDeviceBuffer
());
else
settleKernel
.
setArg
<
void
*>
(
3
,
NULL
);
settleKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
posDelta
->
getDeviceBuffer
());
settleKernel
.
setArg
<
cl
::
Buffer
>
(
5
,
context
.
getVelm
().
getDeviceBuffer
());
settleKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
settleAtoms
->
getDeviceBuffer
());
settleKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
settleParams
->
getDeviceBuffer
());
}
settleKernel
.
setArg
<
cl_float
>
(
1
,
(
cl_float
)
tol
);
if
(
context
.
getUseDoublePrecision
()
||
context
.
getUseMixedPrecision
())
settleKernel
.
setArg
<
cl_double
>
(
1
,
(
cl_double
)
tol
);
else
settleKernel
.
setArg
<
cl_float
>
(
1
,
(
cl_float
)
tol
);
context
.
executeKernel
(
settleKernel
,
settleAtoms
->
getSize
());
}
if
(
shakeAtoms
!=
NULL
)
{
if
(
!
hasInitialized
)
{
shakeKernel
.
setArg
<
cl_int
>
(
0
,
shakeAtoms
->
getSize
());
shakeKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
context
.
getPosq
().
getDeviceBuffer
());
shakeKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
constrainVelocities
?
context
.
getVelm
().
getDeviceBuffer
()
:
posDelta
->
getDeviceBuffer
());
shakeKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
shakeAtoms
->
getDeviceBuffer
());
shakeKernel
.
setArg
<
cl
::
Buffer
>
(
5
,
shakeParams
->
getDeviceBuffer
());
if
(
context
.
getUseMixedPrecision
())
shakeKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
context
.
getPosqCorrection
().
getDeviceBuffer
());
else
shakeKernel
.
setArg
<
void
*>
(
3
,
NULL
);
shakeKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
constrainVelocities
?
context
.
getVelm
().
getDeviceBuffer
()
:
posDelta
->
getDeviceBuffer
());
shakeKernel
.
setArg
<
cl
::
Buffer
>
(
5
,
shakeAtoms
->
getDeviceBuffer
());
shakeKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
shakeParams
->
getDeviceBuffer
());
}
shakeKernel
.
setArg
<
cl_float
>
(
1
,
(
cl_float
)
tol
);
if
(
context
.
getUseDoublePrecision
()
||
context
.
getUseMixedPrecision
())
shakeKernel
.
setArg
<
cl_double
>
(
1
,
(
cl_double
)
tol
);
else
shakeKernel
.
setArg
<
cl_float
>
(
1
,
(
cl_float
)
tol
);
context
.
executeKernel
(
shakeKernel
,
shakeAtoms
->
getSize
());
}
if
(
ccmaAtoms
!=
NULL
)
{
...
...
@@ -710,6 +778,10 @@ void OpenCLIntegrationUtilities::applyConstraints(bool constrainVelocities, doub
ccmaDirectionsKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
ccmaAtoms
->
getDeviceBuffer
());
ccmaDirectionsKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
ccmaDistance
->
getDeviceBuffer
());
ccmaDirectionsKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
context
.
getPosq
().
getDeviceBuffer
());
if
(
context
.
getUseMixedPrecision
())
ccmaDirectionsKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
context
.
getPosqCorrection
().
getDeviceBuffer
());
else
ccmaDirectionsKernel
.
setArg
<
void
*>
(
3
,
NULL
);
ccmaForceKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
ccmaAtoms
->
getDeviceBuffer
());
ccmaForceKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
ccmaDistance
->
getDeviceBuffer
());
ccmaForceKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
constrainVelocities
?
context
.
getVelm
().
getDeviceBuffer
()
:
posDelta
->
getDeviceBuffer
());
...
...
@@ -730,7 +802,10 @@ void OpenCLIntegrationUtilities::applyConstraints(bool constrainVelocities, doub
ccmaUpdateKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
ccmaDelta2
->
getDeviceBuffer
());
ccmaUpdateKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
ccmaConverged
->
getDeviceBuffer
());
}
ccmaForceKernel
.
setArg
<
cl_float
>
(
6
,
(
cl_float
)
tol
);
if
(
context
.
getUseDoublePrecision
()
||
context
.
getUseMixedPrecision
())
ccmaForceKernel
.
setArg
<
cl_double
>
(
6
,
(
cl_double
)
tol
);
else
ccmaForceKernel
.
setArg
<
cl_float
>
(
6
,
(
cl_float
)
tol
);
context
.
executeKernel
(
ccmaDirectionsKernel
,
ccmaAtoms
->
getSize
());
const
int
checkInterval
=
4
;
cl
::
Event
event
;
...
...
@@ -764,7 +839,7 @@ void OpenCLIntegrationUtilities::computeVirtualSites() {
void
OpenCLIntegrationUtilities
::
distributeForcesFromVirtualSites
()
{
if
(
numVsites
>
0
)
{
vsiteForceKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
context
.
getForce
().
getDeviceBuffer
());
vsiteForceKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
context
.
getForce
().
getDeviceBuffer
());
context
.
executeKernel
(
vsiteForceKernel
,
numVsites
);
}
}
...
...
platforms/opencl/src/OpenCLKernels.cpp
View file @
8d6a2a01
...
...
@@ -66,6 +66,13 @@ static string intToString(int value) {
return
s
.
str
();
}
static
void
setPosqCorrectionArg
(
OpenCLContext
&
cl
,
cl
::
Kernel
&
kernel
,
int
index
)
{
if
(
cl
.
getUseMixedPrecision
())
kernel
.
setArg
<
cl
::
Buffer
>
(
index
,
cl
.
getPosqCorrection
().
getDeviceBuffer
());
else
kernel
.
setArg
<
void
*>
(
index
,
NULL
);
}
static
bool
isZeroExpression
(
const
Lepton
::
ParsedExpression
&
expression
)
{
const
Lepton
::
Operation
&
op
=
expression
.
getRootNode
().
getOperation
();
if
(
op
.
getId
()
!=
Lepton
::
Operation
::
CONSTANT
)
...
...
@@ -139,81 +146,172 @@ void OpenCLUpdateStateDataKernel::setTime(ContextImpl& context, double time) {
}
void
OpenCLUpdateStateDataKernel
::
getPositions
(
ContextImpl
&
context
,
vector
<
Vec3
>&
positions
)
{
mm_float4
*
posq
=
(
mm_float4
*
)
cl
.
getPinnedBuffer
();
cl
.
getPosq
().
download
(
posq
);
const
vector
<
cl_int
>&
order
=
cl
.
getAtomIndex
();
int
numParticles
=
context
.
getSystem
().
getNumParticles
();
positions
.
resize
(
numParticles
);
mm_float4
periodicBoxSize
=
cl
.
getPeriodicBoxSize
();
for
(
int
i
=
0
;
i
<
numParticles
;
++
i
)
{
mm_float4
pos
=
posq
[
i
];
mm_int4
offset
=
cl
.
getPosCellOffsets
()[
i
];
positions
[
order
[
i
]]
=
Vec3
(
pos
.
x
-
offset
.
x
*
periodicBoxSize
.
x
,
pos
.
y
-
offset
.
y
*
periodicBoxSize
.
y
,
pos
.
z
-
offset
.
z
*
periodicBoxSize
.
z
);
mm_double4
periodicBoxSize
=
cl
.
getPeriodicBoxSizeDouble
();
if
(
cl
.
getUseDoublePrecision
())
{
mm_double4
*
posq
=
(
mm_double4
*
)
cl
.
getPinnedBuffer
();
cl
.
getPosq
().
download
(
posq
);
for
(
int
i
=
0
;
i
<
numParticles
;
++
i
)
{
mm_double4
pos
=
posq
[
i
];
mm_int4
offset
=
cl
.
getPosCellOffsets
()[
i
];
positions
[
order
[
i
]]
=
Vec3
(
pos
.
x
-
offset
.
x
*
periodicBoxSize
.
x
,
pos
.
y
-
offset
.
y
*
periodicBoxSize
.
y
,
pos
.
z
-
offset
.
z
*
periodicBoxSize
.
z
);
}
}
else
if
(
cl
.
getUseMixedPrecision
())
{
mm_float4
*
posq
=
(
mm_float4
*
)
cl
.
getPinnedBuffer
();
vector
<
mm_float4
>
posCorrection
;
cl
.
getPosq
().
download
(
posq
);
cl
.
getPosqCorrection
().
download
(
posCorrection
);
for
(
int
i
=
0
;
i
<
numParticles
;
++
i
)
{
mm_float4
pos1
=
posq
[
i
];
mm_float4
pos2
=
posCorrection
[
i
];
mm_int4
offset
=
cl
.
getPosCellOffsets
()[
i
];
positions
[
order
[
i
]]
=
Vec3
((
double
)
pos1
.
x
+
(
double
)
pos2
.
x
-
offset
.
x
*
periodicBoxSize
.
x
,
(
double
)
pos1
.
y
+
(
double
)
pos2
.
y
-
offset
.
y
*
periodicBoxSize
.
y
,
(
double
)
pos1
.
z
+
(
double
)
pos2
.
z
-
offset
.
z
*
periodicBoxSize
.
z
);
}
}
else
{
mm_float4
*
posq
=
(
mm_float4
*
)
cl
.
getPinnedBuffer
();
cl
.
getPosq
().
download
(
posq
);
for
(
int
i
=
0
;
i
<
numParticles
;
++
i
)
{
mm_float4
pos
=
posq
[
i
];
mm_int4
offset
=
cl
.
getPosCellOffsets
()[
i
];
positions
[
order
[
i
]]
=
Vec3
(
pos
.
x
-
offset
.
x
*
periodicBoxSize
.
x
,
pos
.
y
-
offset
.
y
*
periodicBoxSize
.
y
,
pos
.
z
-
offset
.
z
*
periodicBoxSize
.
z
);
}
}
}
void
OpenCLUpdateStateDataKernel
::
setPositions
(
ContextImpl
&
context
,
const
vector
<
Vec3
>&
positions
)
{
mm_float4
*
posq
=
(
mm_float4
*
)
cl
.
getPinnedBuffer
();
cl
.
getPosq
().
download
(
posq
);
const
vector
<
cl_int
>&
order
=
cl
.
getAtomIndex
();
int
numParticles
=
context
.
getSystem
().
getNumParticles
();
for
(
int
i
=
0
;
i
<
numParticles
;
++
i
)
{
mm_float4
&
pos
=
posq
[
i
];
const
Vec3
&
p
=
positions
[
order
[
i
]];
pos
.
x
=
(
cl_float
)
p
[
0
];
pos
.
y
=
(
cl_float
)
p
[
1
];
pos
.
z
=
(
cl_float
)
p
[
2
];
}
for
(
int
i
=
numParticles
;
i
<
cl
.
getPaddedNumAtoms
();
i
++
)
posq
[
i
]
=
mm_float4
(
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
);
cl
.
getPosq
().
upload
(
posq
);
if
(
cl
.
getUseDoublePrecision
())
{
mm_double4
*
posq
=
(
mm_double4
*
)
cl
.
getPinnedBuffer
();
cl
.
getPosq
().
download
(
posq
);
for
(
int
i
=
0
;
i
<
numParticles
;
++
i
)
{
mm_double4
&
pos
=
posq
[
i
];
const
Vec3
&
p
=
positions
[
order
[
i
]];
pos
.
x
=
p
[
0
];
pos
.
y
=
p
[
1
];
pos
.
z
=
p
[
2
];
}
for
(
int
i
=
numParticles
;
i
<
cl
.
getPaddedNumAtoms
();
i
++
)
posq
[
i
]
=
mm_double4
(
0.0
,
0.0
,
0.0
,
0.0
);
cl
.
getPosq
().
upload
(
posq
);
}
else
{
mm_float4
*
posq
=
(
mm_float4
*
)
cl
.
getPinnedBuffer
();
cl
.
getPosq
().
download
(
posq
);
for
(
int
i
=
0
;
i
<
numParticles
;
++
i
)
{
mm_float4
&
pos
=
posq
[
i
];
const
Vec3
&
p
=
positions
[
order
[
i
]];
pos
.
x
=
(
cl_float
)
p
[
0
];
pos
.
y
=
(
cl_float
)
p
[
1
];
pos
.
z
=
(
cl_float
)
p
[
2
];
}
for
(
int
i
=
numParticles
;
i
<
cl
.
getPaddedNumAtoms
();
i
++
)
posq
[
i
]
=
mm_float4
(
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
);
cl
.
getPosq
().
upload
(
posq
);
}
if
(
cl
.
getUseMixedPrecision
())
{
mm_float4
*
posCorrection
=
(
mm_float4
*
)
cl
.
getPinnedBuffer
();
for
(
int
i
=
0
;
i
<
numParticles
;
++
i
)
{
mm_float4
&
c
=
posCorrection
[
i
];
const
Vec3
&
p
=
positions
[
order
[
i
]];
c
.
x
=
(
cl_float
)
(
p
[
0
]
-
(
cl_float
)
p
[
0
]);
c
.
y
=
(
cl_float
)
(
p
[
1
]
-
(
cl_float
)
p
[
1
]);
c
.
z
=
(
cl_float
)
(
p
[
2
]
-
(
cl_float
)
p
[
2
]);
c
.
w
=
0
;
}
for
(
int
i
=
numParticles
;
i
<
cl
.
getPaddedNumAtoms
();
i
++
)
posCorrection
[
i
]
=
mm_float4
(
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
);
cl
.
getPosqCorrection
().
upload
(
posCorrection
);
}
for
(
int
i
=
0
;
i
<
(
int
)
cl
.
getPosCellOffsets
().
size
();
i
++
)
cl
.
getPosCellOffsets
()[
i
]
=
mm_int4
(
0
,
0
,
0
,
0
);
}
void
OpenCLUpdateStateDataKernel
::
getVelocities
(
ContextImpl
&
context
,
vector
<
Vec3
>&
velocities
)
{
mm_float4
*
velm
=
(
mm_float4
*
)
cl
.
getPinnedBuffer
();
cl
.
getVelm
().
download
(
velm
);
const
vector
<
cl_int
>&
order
=
cl
.
getAtomIndex
();
int
numParticles
=
context
.
getSystem
().
getNumParticles
();
velocities
.
resize
(
numParticles
);
for
(
int
i
=
0
;
i
<
numParticles
;
++
i
)
{
mm_float4
vel
=
velm
[
i
];
velocities
[
order
[
i
]]
=
Vec3
(
vel
.
x
,
vel
.
y
,
vel
.
z
);
if
(
cl
.
getUseDoublePrecision
()
||
cl
.
getUseMixedPrecision
())
{
mm_double4
*
velm
=
(
mm_double4
*
)
cl
.
getPinnedBuffer
();
cl
.
getVelm
().
download
(
velm
);
for
(
int
i
=
0
;
i
<
numParticles
;
++
i
)
{
mm_double4
vel
=
velm
[
i
];
mm_int4
offset
=
cl
.
getPosCellOffsets
()[
i
];
velocities
[
order
[
i
]]
=
Vec3
(
vel
.
x
,
vel
.
y
,
vel
.
z
);
}
}
else
{
mm_float4
*
velm
=
(
mm_float4
*
)
cl
.
getPinnedBuffer
();
cl
.
getVelm
().
download
(
velm
);
for
(
int
i
=
0
;
i
<
numParticles
;
++
i
)
{
mm_float4
vel
=
velm
[
i
];
mm_int4
offset
=
cl
.
getPosCellOffsets
()[
i
];
velocities
[
order
[
i
]]
=
Vec3
(
vel
.
x
,
vel
.
y
,
vel
.
z
);
}
}
}
void
OpenCLUpdateStateDataKernel
::
setVelocities
(
ContextImpl
&
context
,
const
vector
<
Vec3
>&
velocities
)
{
mm_float4
*
velm
=
(
mm_float4
*
)
cl
.
getPinnedBuffer
();
cl
.
getVelm
().
download
(
velm
);
const
vector
<
cl_int
>&
order
=
cl
.
getAtomIndex
();
int
numParticles
=
context
.
getSystem
().
getNumParticles
();
for
(
int
i
=
0
;
i
<
numParticles
;
++
i
)
{
mm_float4
&
vel
=
velm
[
i
];
const
Vec3
&
p
=
velocities
[
order
[
i
]];
vel
.
x
=
(
cl_float
)
p
[
0
];
vel
.
y
=
(
cl_float
)
p
[
1
];
vel
.
z
=
(
cl_float
)
p
[
2
];
}
for
(
int
i
=
numParticles
;
i
<
cl
.
getPaddedNumAtoms
();
i
++
)
velm
[
i
]
=
mm_float4
(
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
);
cl
.
getVelm
().
upload
(
velm
);
if
(
cl
.
getUseDoublePrecision
()
||
cl
.
getUseMixedPrecision
())
{
mm_double4
*
velm
=
(
mm_double4
*
)
cl
.
getPinnedBuffer
();
cl
.
getVelm
().
download
(
velm
);
for
(
int
i
=
0
;
i
<
numParticles
;
++
i
)
{
mm_double4
&
vel
=
velm
[
i
];
const
Vec3
&
p
=
velocities
[
order
[
i
]];
vel
.
x
=
p
[
0
];
vel
.
y
=
p
[
1
];
vel
.
z
=
p
[
2
];
}
for
(
int
i
=
numParticles
;
i
<
cl
.
getPaddedNumAtoms
();
i
++
)
velm
[
i
]
=
mm_double4
(
0.0
,
0.0
,
0.0
,
0.0
);
cl
.
getVelm
().
upload
(
velm
);
}
else
{
mm_float4
*
velm
=
(
mm_float4
*
)
cl
.
getPinnedBuffer
();
cl
.
getVelm
().
download
(
velm
);
for
(
int
i
=
0
;
i
<
numParticles
;
++
i
)
{
mm_float4
&
vel
=
velm
[
i
];
const
Vec3
&
p
=
velocities
[
order
[
i
]];
vel
.
x
=
p
[
0
];
vel
.
y
=
p
[
1
];
vel
.
z
=
p
[
2
];
}
for
(
int
i
=
numParticles
;
i
<
cl
.
getPaddedNumAtoms
();
i
++
)
velm
[
i
]
=
mm_float4
(
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
);
cl
.
getVelm
().
upload
(
velm
);
}
}
void
OpenCLUpdateStateDataKernel
::
getForces
(
ContextImpl
&
context
,
vector
<
Vec3
>&
forces
)
{
mm_float4
*
force
=
(
mm_float4
*
)
cl
.
getPinnedBuffer
();
cl
.
getForce
().
download
(
force
);
const
vector
<
cl_int
>&
order
=
cl
.
getAtomIndex
();
int
numParticles
=
context
.
getSystem
().
getNumParticles
();
forces
.
resize
(
numParticles
);
for
(
int
i
=
0
;
i
<
numParticles
;
++
i
)
{
mm_float4
f
=
force
[
i
];
forces
[
order
[
i
]]
=
Vec3
(
f
.
x
,
f
.
y
,
f
.
z
);
if
(
cl
.
getUseDoublePrecision
())
{
mm_double4
*
force
=
(
mm_double4
*
)
cl
.
getPinnedBuffer
();
cl
.
getForce
().
download
(
force
);
for
(
int
i
=
0
;
i
<
numParticles
;
++
i
)
{
mm_double4
f
=
force
[
i
];
forces
[
order
[
i
]]
=
Vec3
(
f
.
x
,
f
.
y
,
f
.
z
);
}
}
else
{
mm_float4
*
force
=
(
mm_float4
*
)
cl
.
getPinnedBuffer
();
cl
.
getForce
().
download
(
force
);
for
(
int
i
=
0
;
i
<
numParticles
;
++
i
)
{
mm_float4
f
=
force
[
i
];
forces
[
order
[
i
]]
=
Vec3
(
f
.
x
,
f
.
y
,
f
.
z
);
}
}
}
void
OpenCLUpdateStateDataKernel
::
getPeriodicBoxVectors
(
ContextImpl
&
context
,
Vec3
&
a
,
Vec3
&
b
,
Vec3
&
c
)
const
{
mm_
float
4
box
=
cl
.
getPeriodicBoxSize
();
mm_
double
4
box
=
cl
.
getPeriodicBoxSize
Double
();
a
=
Vec3
(
box
.
x
,
0
,
0
);
b
=
Vec3
(
0
,
box
.
y
,
0
);
c
=
Vec3
(
0
,
0
,
box
.
z
);
...
...
@@ -228,6 +326,8 @@ void OpenCLUpdateStateDataKernel::setPeriodicBoxVectors(ContextImpl& context, co
void
OpenCLUpdateStateDataKernel
::
createCheckpoint
(
ContextImpl
&
context
,
ostream
&
stream
)
{
int
version
=
1
;
stream
.
write
((
char
*
)
&
version
,
sizeof
(
int
));
int
precision
=
(
cl
.
getUseDoublePrecision
()
?
2
:
cl
.
getUseMixedPrecision
()
?
1
:
0
);
stream
.
write
((
char
*
)
&
precision
,
sizeof
(
int
));
double
time
=
cl
.
getTime
();
stream
.
write
((
char
*
)
&
time
,
sizeof
(
double
));
int
stepCount
=
cl
.
getStepCount
();
...
...
@@ -235,10 +335,14 @@ void OpenCLUpdateStateDataKernel::createCheckpoint(ContextImpl& context, ostream
int
computeForceCount
=
cl
.
getComputeForceCount
();
stream
.
write
((
char
*
)
&
computeForceCount
,
sizeof
(
int
));
char
*
buffer
=
(
char
*
)
cl
.
getPinnedBuffer
();
cl
.
getPosq
().
download
((
mm_float4
*
)
buffer
);
stream
.
write
(
buffer
,
sizeof
(
mm_float4
)
*
cl
.
getPosq
().
getSize
());
cl
.
getVelm
().
download
((
mm_float4
*
)
buffer
);
stream
.
write
(
buffer
,
sizeof
(
mm_float4
)
*
cl
.
getVelm
().
getSize
());
cl
.
getPosq
().
download
(
buffer
);
stream
.
write
(
buffer
,
cl
.
getPosq
().
getSize
()
*
cl
.
getPosq
().
getElementSize
());
if
(
cl
.
getUseMixedPrecision
())
{
cl
.
getPosqCorrection
().
download
(
buffer
);
stream
.
write
(
buffer
,
cl
.
getPosqCorrection
().
getSize
()
*
cl
.
getPosqCorrection
().
getElementSize
());
}
cl
.
getVelm
().
download
(
buffer
);
stream
.
write
(
buffer
,
cl
.
getVelm
().
getSize
()
*
cl
.
getVelm
().
getElementSize
());
stream
.
write
((
char
*
)
&
cl
.
getAtomIndex
()[
0
],
sizeof
(
cl_int
)
*
cl
.
getAtomIndex
().
size
());
stream
.
write
((
char
*
)
&
cl
.
getPosCellOffsets
()[
0
],
sizeof
(
mm_int4
)
*
cl
.
getPosCellOffsets
().
size
());
mm_float4
box
=
cl
.
getPeriodicBoxSize
();
...
...
@@ -252,6 +356,11 @@ void OpenCLUpdateStateDataKernel::loadCheckpoint(ContextImpl& context, istream&
stream
.
read
((
char
*
)
&
version
,
sizeof
(
int
));
if
(
version
!=
1
)
throw
OpenMMException
(
"Checkpoint was created with a different version of OpenMM"
);
int
precision
;
stream
.
read
((
char
*
)
&
precision
,
sizeof
(
int
));
int
expectedPrecision
=
(
cl
.
getUseDoublePrecision
()
?
2
:
cl
.
getUseMixedPrecision
()
?
1
:
0
);
if
(
precision
!=
expectedPrecision
)
throw
OpenMMException
(
"Checkpoint was created with a different numeric precision"
);
double
time
;
stream
.
read
((
char
*
)
&
time
,
sizeof
(
double
));
int
stepCount
,
computeForceCount
;
...
...
@@ -264,9 +373,13 @@ void OpenCLUpdateStateDataKernel::loadCheckpoint(ContextImpl& context, istream&
contexts
[
i
]
->
setComputeForceCount
(
computeForceCount
);
}
char
*
buffer
=
(
char
*
)
cl
.
getPinnedBuffer
();
stream
.
read
(
buffer
,
sizeof
(
mm_float4
)
*
cl
.
getPosq
().
getSize
());
stream
.
read
(
buffer
,
cl
.
getPosq
().
getSize
(
)
*
cl
.
getPosq
().
get
Element
Size
());
cl
.
getPosq
().
upload
(
buffer
);
stream
.
read
(
buffer
,
sizeof
(
mm_float4
)
*
cl
.
getVelm
().
getSize
());
if
(
cl
.
getUseMixedPrecision
())
{
stream
.
read
(
buffer
,
cl
.
getPosqCorrection
().
getSize
()
*
cl
.
getPosqCorrection
().
getElementSize
());
cl
.
getPosqCorrection
().
upload
(
buffer
);
}
stream
.
read
(
buffer
,
cl
.
getVelm
().
getSize
()
*
cl
.
getVelm
().
getElementSize
());
cl
.
getVelm
().
upload
(
buffer
);
stream
.
read
((
char
*
)
&
cl
.
getAtomIndex
()[
0
],
sizeof
(
cl_int
)
*
cl
.
getAtomIndex
().
size
());
cl
.
getAtomIndexArray
().
upload
(
cl
.
getAtomIndex
());
...
...
@@ -292,7 +405,8 @@ void OpenCLApplyConstraintsKernel::apply(ContextImpl& context, double tol) {
cl
::
Program
program
=
cl
.
createProgram
(
OpenCLKernelSources
::
constraints
,
defines
);
applyDeltasKernel
=
cl
::
Kernel
(
program
,
"applyPositionDeltas"
);
applyDeltasKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
cl
.
getPosq
().
getDeviceBuffer
());
applyDeltasKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
cl
.
getIntegrationUtilities
().
getPosDelta
().
getDeviceBuffer
());
setPosqCorrectionArg
(
cl
,
applyDeltasKernel
,
1
);
applyDeltasKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
cl
.
getIntegrationUtilities
().
getPosDelta
().
getDeviceBuffer
());
}
OpenCLIntegrationUtilities
&
integration
=
cl
.
getIntegrationUtilities
();
cl
.
clearBuffer
(
integration
.
getPosDelta
());
...
...
@@ -4000,19 +4114,28 @@ void OpenCLIntegrateVerletStepKernel::execute(ContextImpl& context, const Verlet
kernel1
.
setArg
<
cl_int
>
(
0
,
numAtoms
);
kernel1
.
setArg
<
cl
::
Buffer
>
(
1
,
cl
.
getIntegrationUtilities
().
getStepSize
().
getDeviceBuffer
());
kernel1
.
setArg
<
cl
::
Buffer
>
(
2
,
cl
.
getPosq
().
getDeviceBuffer
());
kernel1
.
setArg
<
cl
::
Buffer
>
(
3
,
cl
.
getVelm
().
getDeviceBuffer
());
kernel1
.
setArg
<
cl
::
Buffer
>
(
4
,
cl
.
getForce
().
getDeviceBuffer
());
kernel1
.
setArg
<
cl
::
Buffer
>
(
5
,
integration
.
getPosDelta
().
getDeviceBuffer
());
setPosqCorrectionArg
(
cl
,
kernel1
,
3
);
kernel1
.
setArg
<
cl
::
Buffer
>
(
4
,
cl
.
getVelm
().
getDeviceBuffer
());
kernel1
.
setArg
<
cl
::
Buffer
>
(
5
,
cl
.
getForce
().
getDeviceBuffer
());
kernel1
.
setArg
<
cl
::
Buffer
>
(
6
,
integration
.
getPosDelta
().
getDeviceBuffer
());
kernel2
.
setArg
<
cl_int
>
(
0
,
numAtoms
);
kernel2
.
setArg
<
cl
::
Buffer
>
(
1
,
cl
.
getIntegrationUtilities
().
getStepSize
().
getDeviceBuffer
());
kernel2
.
setArg
<
cl
::
Buffer
>
(
2
,
cl
.
getPosq
().
getDeviceBuffer
());
kernel2
.
setArg
<
cl
::
Buffer
>
(
3
,
cl
.
getVelm
().
getDeviceBuffer
());
kernel2
.
setArg
<
cl
::
Buffer
>
(
4
,
integration
.
getPosDelta
().
getDeviceBuffer
());
setPosqCorrectionArg
(
cl
,
kernel2
,
3
);
kernel2
.
setArg
<
cl
::
Buffer
>
(
4
,
cl
.
getVelm
().
getDeviceBuffer
());
kernel2
.
setArg
<
cl
::
Buffer
>
(
5
,
integration
.
getPosDelta
().
getDeviceBuffer
());
}
if
(
dt
!=
prevStepSize
)
{
vector
<
mm_float2
>
stepSizeVec
(
1
);
stepSizeVec
[
0
]
=
mm_float2
((
cl_float
)
dt
,
(
cl_float
)
dt
);
cl
.
getIntegrationUtilities
().
getStepSize
().
upload
(
stepSizeVec
);
if
(
cl
.
getUseDoublePrecision
()
||
cl
.
getUseMixedPrecision
())
{
vector
<
mm_double2
>
stepSizeVec
(
1
);
stepSizeVec
[
0
]
=
mm_double2
(
dt
,
dt
);
cl
.
getIntegrationUtilities
().
getStepSize
().
upload
(
stepSizeVec
);
}
else
{
vector
<
mm_float2
>
stepSizeVec
(
1
);
stepSizeVec
[
0
]
=
mm_float2
((
cl_float
)
dt
,
(
cl_float
)
dt
);
cl
.
getIntegrationUtilities
().
getStepSize
().
upload
(
stepSizeVec
);
}
prevStepSize
=
dt
;
}
...
...
@@ -4055,7 +4178,7 @@ void OpenCLIntegrateLangevinStepKernel::initialize(const System& system, const L
cl
::
Program
program
=
cl
.
createProgram
(
OpenCLKernelSources
::
langevin
,
defines
,
""
);
kernel1
=
cl
::
Kernel
(
program
,
"integrateLangevinPart1"
);
kernel2
=
cl
::
Kernel
(
program
,
"integrateLangevinPart2"
);
params
=
OpenCLArray
::
create
<
cl_float
>
(
cl
,
3
,
"langevinParams"
);
params
=
new
OpenCLArray
(
cl
,
3
,
cl
.
getUseDoublePrecision
()
||
cl
.
getUseMixedPrecision
()
?
sizeof
(
cl_double
)
:
sizeof
(
cl_float
)
,
"langevinParams"
);
prevStepSize
=
-
1.0
;
}
...
...
@@ -4071,9 +4194,10 @@ void OpenCLIntegrateLangevinStepKernel::execute(ContextImpl& context, const Lang
kernel1
.
setArg
<
cl
::
Buffer
>
(
4
,
integration
.
getStepSize
().
getDeviceBuffer
());
kernel1
.
setArg
<
cl
::
Buffer
>
(
5
,
integration
.
getRandom
().
getDeviceBuffer
());
kernel2
.
setArg
<
cl
::
Buffer
>
(
0
,
cl
.
getPosq
().
getDeviceBuffer
());
kernel2
.
setArg
<
cl
::
Buffer
>
(
1
,
integration
.
getPosDelta
().
getDeviceBuffer
());
kernel2
.
setArg
<
cl
::
Buffer
>
(
2
,
cl
.
getVelm
().
getDeviceBuffer
());
kernel2
.
setArg
<
cl
::
Buffer
>
(
3
,
integration
.
getStepSize
().
getDeviceBuffer
());
setPosqCorrectionArg
(
cl
,
kernel2
,
1
);
kernel2
.
setArg
<
cl
::
Buffer
>
(
2
,
integration
.
getPosDelta
().
getDeviceBuffer
());
kernel2
.
setArg
<
cl
::
Buffer
>
(
3
,
cl
.
getVelm
().
getDeviceBuffer
());
kernel2
.
setArg
<
cl
::
Buffer
>
(
4
,
integration
.
getStepSize
().
getDeviceBuffer
());
}
double
temperature
=
integrator
.
getTemperature
();
double
friction
=
integrator
.
getFriction
();
...
...
@@ -4086,13 +4210,24 @@ void OpenCLIntegrateLangevinStepKernel::execute(ContextImpl& context, const Lang
double
vscale
=
exp
(
-
stepSize
/
tau
);
double
fscale
=
(
1
-
vscale
)
*
tau
;
double
noisescale
=
sqrt
(
2
*
kT
/
tau
)
*
sqrt
(
0.5
*
(
1
-
vscale
*
vscale
)
*
tau
);
vector
<
cl_float
>
p
(
params
->
getSize
());
p
[
0
]
=
(
cl_float
)
vscale
;
p
[
1
]
=
(
cl_float
)
fscale
;
p
[
2
]
=
(
cl_float
)
noisescale
;
params
->
upload
(
p
);
mm_float2
ss
=
mm_float2
(
0
,
(
float
)
stepSize
);
integration
.
getStepSize
().
upload
(
&
ss
);
if
(
cl
.
getUseDoublePrecision
()
||
cl
.
getUseMixedPrecision
())
{
vector
<
cl_double
>
p
(
params
->
getSize
());
p
[
0
]
=
vscale
;
p
[
1
]
=
fscale
;
p
[
2
]
=
noisescale
;
params
->
upload
(
p
);
mm_double2
ss
=
mm_double2
(
0
,
stepSize
);
integration
.
getStepSize
().
upload
(
&
ss
);
}
else
{
vector
<
cl_float
>
p
(
params
->
getSize
());
p
[
0
]
=
(
cl_float
)
vscale
;
p
[
1
]
=
(
cl_float
)
fscale
;
p
[
2
]
=
(
cl_float
)
noisescale
;
params
->
upload
(
p
);
mm_float2
ss
=
mm_float2
(
0
,
(
float
)
stepSize
);
integration
.
getStepSize
().
upload
(
&
ss
);
}
prevTemp
=
temperature
;
prevFriction
=
friction
;
prevStepSize
=
stepSize
;
...
...
@@ -4148,17 +4283,25 @@ void OpenCLIntegrateBrownianStepKernel::execute(ContextImpl& context, const Brow
kernel1
.
setArg
<
cl
::
Buffer
>
(
4
,
cl
.
getVelm
().
getDeviceBuffer
());
kernel1
.
setArg
<
cl
::
Buffer
>
(
5
,
integration
.
getRandom
().
getDeviceBuffer
());
kernel2
.
setArg
<
cl
::
Buffer
>
(
1
,
cl
.
getPosq
().
getDeviceBuffer
());
kernel2
.
setArg
<
cl
::
Buffer
>
(
2
,
cl
.
getVelm
().
getDeviceBuffer
());
kernel2
.
setArg
<
cl
::
Buffer
>
(
3
,
integration
.
getPosDelta
().
getDeviceBuffer
());
setPosqCorrectionArg
(
cl
,
kernel2
,
2
);
kernel2
.
setArg
<
cl
::
Buffer
>
(
3
,
cl
.
getVelm
().
getDeviceBuffer
());
kernel2
.
setArg
<
cl
::
Buffer
>
(
4
,
integration
.
getPosDelta
().
getDeviceBuffer
());
}
double
temperature
=
integrator
.
getTemperature
();
double
friction
=
integrator
.
getFriction
();
double
stepSize
=
integrator
.
getStepSize
();
if
(
temperature
!=
prevTemp
||
friction
!=
prevFriction
||
stepSize
!=
prevStepSize
)
{
double
tau
=
(
friction
==
0.0
?
0.0
:
1.0
/
friction
);
kernel1
.
setArg
<
cl_float
>
(
0
,
(
cl_float
)
(
tau
*
stepSize
));
kernel1
.
setArg
<
cl_float
>
(
1
,
(
cl_float
)
(
sqrt
(
2.0
f
*
BOLTZ
*
temperature
*
stepSize
*
tau
)));
kernel2
.
setArg
<
cl_float
>
(
0
,
(
cl_float
)
(
1.0
/
stepSize
));
if
(
cl
.
getUseDoublePrecision
()
||
cl
.
getUseMixedPrecision
())
{
kernel1
.
setArg
<
cl_double
>
(
0
,
tau
*
stepSize
);
kernel1
.
setArg
<
cl_double
>
(
1
,
sqrt
(
2.0
f
*
BOLTZ
*
temperature
*
stepSize
*
tau
));
kernel2
.
setArg
<
cl_double
>
(
0
,
1.0
/
stepSize
);
}
else
{
kernel1
.
setArg
<
cl_float
>
(
0
,
(
cl_float
)
(
tau
*
stepSize
));
kernel1
.
setArg
<
cl_float
>
(
1
,
(
cl_float
)
(
sqrt
(
2.0
f
*
BOLTZ
*
temperature
*
stepSize
*
tau
)));
kernel2
.
setArg
<
cl_float
>
(
0
,
(
cl_float
)
(
1.0
/
stepSize
));
}
prevTemp
=
temperature
;
prevFriction
=
friction
;
prevStepSize
=
stepSize
;
...
...
@@ -4205,19 +4348,22 @@ void OpenCLIntegrateVariableVerletStepKernel::initialize(const System& system, c
double
OpenCLIntegrateVariableVerletStepKernel
::
execute
(
ContextImpl
&
context
,
const
VariableVerletIntegrator
&
integrator
,
double
maxTime
)
{
OpenCLIntegrationUtilities
&
integration
=
cl
.
getIntegrationUtilities
();
int
numAtoms
=
cl
.
getNumAtoms
();
bool
useDouble
=
cl
.
getUseDoublePrecision
()
||
cl
.
getUseMixedPrecision
();
if
(
!
hasInitializedKernels
)
{
hasInitializedKernels
=
true
;
kernel1
.
setArg
<
cl_int
>
(
0
,
numAtoms
);
kernel1
.
setArg
<
cl
::
Buffer
>
(
1
,
cl
.
getIntegrationUtilities
().
getStepSize
().
getDeviceBuffer
());
kernel1
.
setArg
<
cl
::
Buffer
>
(
2
,
cl
.
getPosq
().
getDeviceBuffer
());
kernel1
.
setArg
<
cl
::
Buffer
>
(
3
,
cl
.
getVelm
().
getDeviceBuffer
());
kernel1
.
setArg
<
cl
::
Buffer
>
(
4
,
cl
.
getForce
().
getDeviceBuffer
());
kernel1
.
setArg
<
cl
::
Buffer
>
(
5
,
integration
.
getPosDelta
().
getDeviceBuffer
());
setPosqCorrectionArg
(
cl
,
kernel1
,
3
);
kernel1
.
setArg
<
cl
::
Buffer
>
(
4
,
cl
.
getVelm
().
getDeviceBuffer
());
kernel1
.
setArg
<
cl
::
Buffer
>
(
5
,
cl
.
getForce
().
getDeviceBuffer
());
kernel1
.
setArg
<
cl
::
Buffer
>
(
6
,
integration
.
getPosDelta
().
getDeviceBuffer
());
kernel2
.
setArg
<
cl_int
>
(
0
,
numAtoms
);
kernel2
.
setArg
<
cl
::
Buffer
>
(
1
,
cl
.
getIntegrationUtilities
().
getStepSize
().
getDeviceBuffer
());
kernel2
.
setArg
<
cl
::
Buffer
>
(
2
,
cl
.
getPosq
().
getDeviceBuffer
());
kernel2
.
setArg
<
cl
::
Buffer
>
(
3
,
cl
.
getVelm
().
getDeviceBuffer
());
kernel2
.
setArg
<
cl
::
Buffer
>
(
4
,
integration
.
getPosDelta
().
getDeviceBuffer
());
setPosqCorrectionArg
(
cl
,
kernel2
,
3
);
kernel2
.
setArg
<
cl
::
Buffer
>
(
4
,
cl
.
getVelm
().
getDeviceBuffer
());
kernel2
.
setArg
<
cl
::
Buffer
>
(
5
,
integration
.
getPosDelta
().
getDeviceBuffer
());
selectSizeKernel
.
setArg
<
cl_int
>
(
0
,
numAtoms
);
selectSizeKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
cl
.
getIntegrationUtilities
().
getStepSize
().
getDeviceBuffer
());
selectSizeKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
cl
.
getVelm
().
getDeviceBuffer
());
...
...
@@ -4227,9 +4373,16 @@ double OpenCLIntegrateVariableVerletStepKernel::execute(ContextImpl& context, co
// Select the step size to use.
float
maxStepSize
=
(
float
)(
maxTime
-
cl
.
getTime
());
selectSizeKernel
.
setArg
<
cl_float
>
(
1
,
maxStepSize
);
selectSizeKernel
.
setArg
<
cl_float
>
(
2
,
(
cl_float
)
integrator
.
getErrorTolerance
());
double
maxStepSize
=
maxTime
-
cl
.
getTime
();
float
maxStepSizeFloat
=
(
float
)
maxStepSize
;
if
(
useDouble
)
{
selectSizeKernel
.
setArg
<
cl_double
>
(
1
,
maxStepSize
);
selectSizeKernel
.
setArg
<
cl_double
>
(
2
,
integrator
.
getErrorTolerance
());
}
else
{
selectSizeKernel
.
setArg
<
cl_float
>
(
1
,
maxStepSizeFloat
);
selectSizeKernel
.
setArg
<
cl_float
>
(
2
,
(
cl_float
)
integrator
.
getErrorTolerance
());
}
cl
.
executeKernel
(
selectSizeKernel
,
blockSize
,
blockSize
);
// Call the first integration kernel.
...
...
@@ -4253,12 +4406,23 @@ double OpenCLIntegrateVariableVerletStepKernel::execute(ContextImpl& context, co
// Update the time and step count.
mm_float2
stepSize
;
cl
.
getIntegrationUtilities
().
getStepSize
().
download
(
&
stepSize
);
double
dt
=
stepSize
.
y
;
double
time
=
cl
.
getTime
()
+
dt
;
if
(
dt
==
maxStepSize
)
time
=
maxTime
;
// Avoid round-off error
double
dt
,
time
;
if
(
useDouble
)
{
mm_double2
stepSize
;
cl
.
getIntegrationUtilities
().
getStepSize
().
download
(
&
stepSize
);
dt
=
stepSize
.
y
;
time
=
cl
.
getTime
()
+
dt
;
if
(
dt
==
maxStepSize
)
time
=
maxTime
;
// Avoid round-off error
}
else
{
mm_float2
stepSize
;
cl
.
getIntegrationUtilities
().
getStepSize
().
download
(
&
stepSize
);
dt
=
stepSize
.
y
;
time
=
cl
.
getTime
()
+
dt
;
if
(
dt
==
maxStepSizeFloat
)
time
=
maxTime
;
// Avoid round-off error
}
cl
.
setTime
(
time
);
cl
.
setStepCount
(
cl
.
getStepCount
()
+
1
);
return
dt
;
...
...
@@ -4279,7 +4443,7 @@ void OpenCLIntegrateVariableLangevinStepKernel::initialize(const System& system,
kernel1
=
cl
::
Kernel
(
program
,
"integrateLangevinPart1"
);
kernel2
=
cl
::
Kernel
(
program
,
"integrateLangevinPart2"
);
selectSizeKernel
=
cl
::
Kernel
(
program
,
"selectLangevinStepSize"
);
params
=
OpenCLArray
::
create
<
cl_float
>
(
cl
,
3
,
"langevinParams"
);
params
=
new
OpenCLArray
(
cl
,
3
,
cl
.
getUseDoublePrecision
()
||
cl
.
getUseMixedPrecision
()
?
sizeof
(
cl_double
)
:
sizeof
(
cl_float
)
,
"langevinParams"
);
blockSize
=
min
(
256
,
system
.
getNumParticles
());
blockSize
=
max
(
blockSize
,
params
->
getSize
());
blockSize
=
min
(
blockSize
,
(
int
)
cl
.
getDevice
().
getInfo
<
CL_DEVICE_MAX_WORK_GROUP_SIZE
>
());
...
...
@@ -4288,6 +4452,7 @@ void OpenCLIntegrateVariableLangevinStepKernel::initialize(const System& system,
double
OpenCLIntegrateVariableLangevinStepKernel
::
execute
(
ContextImpl
&
context
,
const
VariableLangevinIntegrator
&
integrator
,
double
maxTime
)
{
OpenCLIntegrationUtilities
&
integration
=
cl
.
getIntegrationUtilities
();
int
numAtoms
=
cl
.
getNumAtoms
();
bool
useDouble
=
cl
.
getUseDoublePrecision
()
||
cl
.
getUseMixedPrecision
();
if
(
!
hasInitializedKernels
)
{
hasInitializedKernels
=
true
;
kernel1
.
setArg
<
cl
::
Buffer
>
(
0
,
cl
.
getVelm
().
getDeviceBuffer
());
...
...
@@ -4297,9 +4462,10 @@ double OpenCLIntegrateVariableLangevinStepKernel::execute(ContextImpl& context,
kernel1
.
setArg
<
cl
::
Buffer
>
(
4
,
integration
.
getStepSize
().
getDeviceBuffer
());
kernel1
.
setArg
<
cl
::
Buffer
>
(
5
,
integration
.
getRandom
().
getDeviceBuffer
());
kernel2
.
setArg
<
cl
::
Buffer
>
(
0
,
cl
.
getPosq
().
getDeviceBuffer
());
kernel2
.
setArg
<
cl
::
Buffer
>
(
1
,
integration
.
getPosDelta
().
getDeviceBuffer
());
kernel2
.
setArg
<
cl
::
Buffer
>
(
2
,
cl
.
getVelm
().
getDeviceBuffer
());
kernel2
.
setArg
<
cl
::
Buffer
>
(
3
,
integration
.
getStepSize
().
getDeviceBuffer
());
setPosqCorrectionArg
(
cl
,
kernel2
,
1
);
kernel2
.
setArg
<
cl
::
Buffer
>
(
2
,
integration
.
getPosDelta
().
getDeviceBuffer
());
kernel2
.
setArg
<
cl
::
Buffer
>
(
3
,
cl
.
getVelm
().
getDeviceBuffer
());
kernel2
.
setArg
<
cl
::
Buffer
>
(
4
,
integration
.
getStepSize
().
getDeviceBuffer
());
selectSizeKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
integration
.
getStepSize
().
getDeviceBuffer
());
selectSizeKernel
.
setArg
<
cl
::
Buffer
>
(
5
,
cl
.
getVelm
().
getDeviceBuffer
());
selectSizeKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
cl
.
getForce
().
getDeviceBuffer
());
...
...
@@ -4310,11 +4476,20 @@ double OpenCLIntegrateVariableLangevinStepKernel::execute(ContextImpl& context,
// Select the step size to use.
float
maxStepSize
=
(
float
)(
maxTime
-
cl
.
getTime
());
selectSizeKernel
.
setArg
<
cl_float
>
(
0
,
maxStepSize
);
selectSizeKernel
.
setArg
<
cl_float
>
(
1
,
(
cl_float
)
integrator
.
getErrorTolerance
());
selectSizeKernel
.
setArg
<
cl_float
>
(
2
,
(
cl_float
)
(
integrator
.
getFriction
()
==
0.0
?
0.0
:
1.0
/
integrator
.
getFriction
()));
selectSizeKernel
.
setArg
<
cl_float
>
(
3
,
(
cl_float
)
(
BOLTZ
*
integrator
.
getTemperature
()));
double
maxStepSize
=
maxTime
-
cl
.
getTime
();
float
maxStepSizeFloat
=
(
float
)
maxStepSize
;
if
(
useDouble
)
{
selectSizeKernel
.
setArg
<
cl_double
>
(
0
,
maxStepSize
);
selectSizeKernel
.
setArg
<
cl_double
>
(
1
,
integrator
.
getErrorTolerance
());
selectSizeKernel
.
setArg
<
cl_double
>
(
2
,
integrator
.
getFriction
()
==
0.0
?
0.0
:
1.0
/
integrator
.
getFriction
());
selectSizeKernel
.
setArg
<
cl_double
>
(
3
,
BOLTZ
*
integrator
.
getTemperature
());
}
else
{
selectSizeKernel
.
setArg
<
cl_float
>
(
0
,
maxStepSizeFloat
);
selectSizeKernel
.
setArg
<
cl_float
>
(
1
,
(
cl_float
)
integrator
.
getErrorTolerance
());
selectSizeKernel
.
setArg
<
cl_float
>
(
2
,
(
cl_float
)
(
integrator
.
getFriction
()
==
0.0
?
0.0
:
1.0
/
integrator
.
getFriction
()));
selectSizeKernel
.
setArg
<
cl_float
>
(
3
,
(
cl_float
)
(
BOLTZ
*
integrator
.
getTemperature
()));
}
cl
.
executeKernel
(
selectSizeKernel
,
blockSize
,
blockSize
);
// Call the first integration kernel.
...
...
@@ -4339,12 +4514,23 @@ double OpenCLIntegrateVariableLangevinStepKernel::execute(ContextImpl& context,
// Update the time and step count.
mm_float2
stepSize
;
cl
.
getIntegrationUtilities
().
getStepSize
().
download
(
&
stepSize
);
double
dt
=
stepSize
.
y
;
double
time
=
cl
.
getTime
()
+
dt
;
if
(
dt
==
maxStepSize
)
time
=
maxTime
;
// Avoid round-off error
double
dt
,
time
;
if
(
useDouble
)
{
mm_double2
stepSize
;
cl
.
getIntegrationUtilities
().
getStepSize
().
download
(
&
stepSize
);
dt
=
stepSize
.
y
;
time
=
cl
.
getTime
()
+
dt
;
if
(
dt
==
maxStepSize
)
time
=
maxTime
;
// Avoid round-off error
}
else
{
mm_float2
stepSize
;
cl
.
getIntegrationUtilities
().
getStepSize
().
download
(
&
stepSize
);
dt
=
stepSize
.
y
;
time
=
cl
.
getTime
()
+
dt
;
if
(
dt
==
maxStepSizeFloat
)
time
=
maxTime
;
// Avoid round-off error
}
cl
.
setTime
(
time
);
cl
.
setStepCount
(
cl
.
getStepCount
()
+
1
);
return
dt
;
...
...
@@ -4352,8 +4538,8 @@ double OpenCLIntegrateVariableLangevinStepKernel::execute(ContextImpl& context,
class
OpenCLIntegrateCustomStepKernel
::
ReorderListener
:
public
OpenCLContext
::
ReorderListener
{
public:
ReorderListener
(
OpenCLContext
&
cl
,
OpenCLParameterSet
&
perDofValues
,
vector
<
vector
<
cl_float
>
>&
localPerDofValues
,
bool
&
deviceValuesAreCurrent
)
:
cl
(
cl
),
perDofValues
(
perDofValues
),
localPerDofValues
(
localPerDofValues
),
deviceValuesAreCurrent
(
deviceValuesAreCurrent
)
{
ReorderListener
(
OpenCLContext
&
cl
,
OpenCLParameterSet
&
perDofValues
,
vector
<
vector
<
cl_float
>
>&
localPerDofValues
Float
,
vector
<
vector
<
cl_double
>
>&
localPerDofValuesDouble
,
bool
&
deviceValuesAreCurrent
)
:
cl
(
cl
),
perDofValues
(
perDofValues
),
localPerDofValues
Float
(
localPerDofValues
Float
),
localPerDofValuesDouble
(
localPerDofValuesDouble
),
deviceValuesAreCurrent
(
deviceValuesAreCurrent
)
{
int
numAtoms
=
cl
.
getNumAtoms
();
lastAtomOrder
.
resize
(
numAtoms
);
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
...
...
@@ -4365,21 +4551,39 @@ public:
if
(
perDofValues
.
getNumParameters
()
==
0
)
return
;
int
numAtoms
=
cl
.
getNumAtoms
();
if
(
deviceValuesAreCurrent
)
perDofValues
.
getParameterValues
(
localPerDofValues
);
vector
<
vector
<
cl_float
>
>
swap
(
3
*
numAtoms
);
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
{
swap
[
3
*
lastAtomOrder
[
i
]]
=
localPerDofValues
[
3
*
i
];
swap
[
3
*
lastAtomOrder
[
i
]
+
1
]
=
localPerDofValues
[
3
*
i
+
1
];
swap
[
3
*
lastAtomOrder
[
i
]
+
2
]
=
localPerDofValues
[
3
*
i
+
2
];
}
const
vector
<
cl_int
>&
order
=
cl
.
getAtomIndex
();
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
{
localPerDofValues
[
3
*
i
]
=
swap
[
3
*
order
[
i
]];
localPerDofValues
[
3
*
i
+
1
]
=
swap
[
3
*
order
[
i
]
+
1
];
localPerDofValues
[
3
*
i
+
2
]
=
swap
[
3
*
order
[
i
]
+
2
];
}
perDofValues
.
setParameterValues
(
localPerDofValues
);
const
vector
<
int
>&
order
=
cl
.
getAtomIndex
();
if
(
cl
.
getUseDoublePrecision
()
||
cl
.
getUseMixedPrecision
())
{
if
(
deviceValuesAreCurrent
)
perDofValues
.
getParameterValues
(
localPerDofValuesDouble
);
vector
<
vector
<
cl_double
>
>
swap
(
3
*
numAtoms
);
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
{
swap
[
3
*
lastAtomOrder
[
i
]]
=
localPerDofValuesDouble
[
3
*
i
];
swap
[
3
*
lastAtomOrder
[
i
]
+
1
]
=
localPerDofValuesDouble
[
3
*
i
+
1
];
swap
[
3
*
lastAtomOrder
[
i
]
+
2
]
=
localPerDofValuesDouble
[
3
*
i
+
2
];
}
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
{
localPerDofValuesDouble
[
3
*
i
]
=
swap
[
3
*
order
[
i
]];
localPerDofValuesDouble
[
3
*
i
+
1
]
=
swap
[
3
*
order
[
i
]
+
1
];
localPerDofValuesDouble
[
3
*
i
+
2
]
=
swap
[
3
*
order
[
i
]
+
2
];
}
perDofValues
.
setParameterValues
(
localPerDofValuesDouble
);
}
else
{
if
(
deviceValuesAreCurrent
)
perDofValues
.
getParameterValues
(
localPerDofValuesFloat
);
vector
<
vector
<
cl_float
>
>
swap
(
3
*
numAtoms
);
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
{
swap
[
3
*
lastAtomOrder
[
i
]]
=
localPerDofValuesFloat
[
3
*
i
];
swap
[
3
*
lastAtomOrder
[
i
]
+
1
]
=
localPerDofValuesFloat
[
3
*
i
+
1
];
swap
[
3
*
lastAtomOrder
[
i
]
+
2
]
=
localPerDofValuesFloat
[
3
*
i
+
2
];
}
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
{
localPerDofValuesFloat
[
3
*
i
]
=
swap
[
3
*
order
[
i
]];
localPerDofValuesFloat
[
3
*
i
+
1
]
=
swap
[
3
*
order
[
i
]
+
1
];
localPerDofValuesFloat
[
3
*
i
+
2
]
=
swap
[
3
*
order
[
i
]
+
2
];
}
perDofValues
.
setParameterValues
(
localPerDofValuesFloat
);
}
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
lastAtomOrder
[
i
]
=
order
[
i
];
deviceValuesAreCurrent
=
true
;
...
...
@@ -4387,7 +4591,8 @@ public:
private:
OpenCLContext
&
cl
;
OpenCLParameterSet
&
perDofValues
;
vector
<
vector
<
cl_float
>
>&
localPerDofValues
;
vector
<
vector
<
cl_float
>
>&
localPerDofValuesFloat
;
vector
<
vector
<
cl_double
>
>&
localPerDofValuesDouble
;
bool
&
deviceValuesAreCurrent
;
vector
<
int
>
lastAtomOrder
;
};
...
...
@@ -4413,11 +4618,12 @@ void OpenCLIntegrateCustomStepKernel::initialize(const System& system, const Cus
cl
.
getPlatformData
().
initializeContexts
(
system
);
cl
.
getIntegrationUtilities
().
initRandomNumberGenerator
(
integrator
.
getRandomNumberSeed
());
numGlobalVariables
=
integrator
.
getNumGlobalVariables
();
globalValues
=
OpenCLArray
::
create
<
cl_float
>
(
cl
,
max
(
1
,
numGlobalVariables
),
"globalVariables"
);
sumBuffer
=
OpenCLArray
::
create
<
cl_float
>
(
cl
,
3
*
system
.
getNumParticles
(),
"sumBuffer"
);
energy
=
OpenCLArray
::
create
<
cl_float
>
(
cl
,
1
,
"energy"
);
perDofValues
=
new
OpenCLParameterSet
(
cl
,
integrator
.
getNumPerDofVariables
(),
3
*
system
.
getNumParticles
(),
"perDofVariables"
);
cl
.
addReorderListener
(
new
ReorderListener
(
cl
,
*
perDofValues
,
localPerDofValues
,
deviceValuesAreCurrent
));
int
elementSize
=
(
cl
.
getUseDoublePrecision
()
||
cl
.
getUseMixedPrecision
()
?
sizeof
(
double
)
:
sizeof
(
float
));
globalValues
=
new
OpenCLArray
(
cl
,
max
(
1
,
numGlobalVariables
),
elementSize
,
"globalVariables"
);
sumBuffer
=
new
OpenCLArray
(
cl
,
3
*
system
.
getNumParticles
(),
elementSize
,
"sumBuffer"
);
energy
=
new
OpenCLArray
(
cl
,
1
,
elementSize
,
"energy"
);
perDofValues
=
new
OpenCLParameterSet
(
cl
,
integrator
.
getNumPerDofVariables
(),
3
*
system
.
getNumParticles
(),
"perDofVariables"
,
false
,
cl
.
getUseDoublePrecision
()
||
cl
.
getUseMixedPrecision
());
cl
.
addReorderListener
(
new
ReorderListener
(
cl
,
*
perDofValues
,
localPerDofValuesFloat
,
localPerDofValuesDouble
,
deviceValuesAreCurrent
));
prevStepSize
=
-
1.0
;
SimTKOpenMMUtilities
::
setRandomNumberSeed
(
integrator
.
getRandomNumberSeed
());
}
...
...
@@ -4492,19 +4698,31 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
OpenCLIntegrationUtilities
&
integration
=
cl
.
getIntegrationUtilities
();
int
numAtoms
=
cl
.
getNumAtoms
();
int
numSteps
=
integrator
.
getNumComputations
();
bool
useDouble
=
cl
.
getUseDoublePrecision
()
||
cl
.
getUseMixedPrecision
();
if
(
!
hasInitializedKernels
)
{
hasInitializedKernels
=
true
;
// Initialize various data structures.
const
map
<
string
,
double
>&
params
=
context
.
getParameters
();
contextParameterValues
=
OpenCLArray
::
create
<
cl_float
>
(
cl
,
max
(
1
,
(
int
)
params
.
size
()),
"contextParameters"
);
contextValues
.
resize
(
contextParameterValues
->
getSize
());
for
(
map
<
string
,
double
>::
const_iterator
iter
=
params
.
begin
();
iter
!=
params
.
end
();
++
iter
)
{
contextValues
[
parameterNames
.
size
()]
=
(
float
)
iter
->
second
;
parameterNames
.
push_back
(
iter
->
first
);
if
(
useDouble
)
{
contextParameterValues
=
OpenCLArray
::
create
<
cl_double
>
(
cl
,
max
(
1
,
(
int
)
params
.
size
()),
"contextParameters"
);
contextValuesDouble
.
resize
(
contextParameterValues
->
getSize
());
for
(
map
<
string
,
double
>::
const_iterator
iter
=
params
.
begin
();
iter
!=
params
.
end
();
++
iter
)
{
contextValuesDouble
[
parameterNames
.
size
()]
=
iter
->
second
;
parameterNames
.
push_back
(
iter
->
first
);
}
contextParameterValues
->
upload
(
contextValuesDouble
);
}
else
{
contextParameterValues
=
OpenCLArray
::
create
<
cl_float
>
(
cl
,
max
(
1
,
(
int
)
params
.
size
()),
"contextParameters"
);
contextValuesFloat
.
resize
(
contextParameterValues
->
getSize
());
for
(
map
<
string
,
double
>::
const_iterator
iter
=
params
.
begin
();
iter
!=
params
.
end
();
++
iter
)
{
contextValuesFloat
[
parameterNames
.
size
()]
=
(
float
)
iter
->
second
;
parameterNames
.
push_back
(
iter
->
first
);
}
contextParameterValues
->
upload
(
contextValuesFloat
);
}
contextParameterValues
->
upload
(
contextValues
);
kernels
.
resize
(
integrator
.
getNumComputations
());
requiredGaussian
.
resize
(
integrator
.
getNumComputations
(),
0
);
requiredUniform
.
resize
(
integrator
.
getNumComputations
(),
0
);
...
...
@@ -4644,7 +4862,6 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
compute
<<
buffer
.
getType
()
<<
" perDofy"
<<
intToString
(
i
+
1
)
<<
" = perDofValues"
<<
intToString
(
i
+
1
)
<<
"[3*index+1];
\n
"
;
compute
<<
buffer
.
getType
()
<<
" perDofz"
<<
intToString
(
i
+
1
)
<<
" = perDofValues"
<<
intToString
(
i
+
1
)
<<
"[3*index+2];
\n
"
;
}
string
convert
=
(
cl
.
getSupportsDoublePrecision
()
?
"convert_float4("
:
"("
);
int
numGaussian
=
0
,
numUniform
=
0
;
for
(
int
j
=
step
;
j
<
numSteps
&&
(
j
==
step
||
merged
[
j
]);
j
++
)
{
compute
<<
"{
\n
"
;
...
...
@@ -4653,15 +4870,15 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
if
(
variable
[
j
]
==
"x"
)
{
if
(
storePosAsDelta
[
j
])
{
if
(
cl
.
getSupportsDoublePrecision
())
compute
<<
"posDelta[index] = convert_
float
4(position-convert_double4(
posq[
index
]
));
\n
"
;
compute
<<
"posDelta[index] = convert_
mixed4(convert_double
4(position
)
-convert_double4(
loadPos(posq, posqCorrection,
index
)
));
\n
"
;
else
compute
<<
"posDelta[index] = position-posq[index];
\n
"
;
}
else
compute
<<
"
posq[index] = "
<<
convert
<<
"
position);
\n
"
;
compute
<<
"
storePos(posq, posqCorrection, index,
position);
\n
"
;
}
else
if
(
variable
[
j
]
==
"v"
)
compute
<<
"velm[index] =
"
<<
convert
<<
"
velocity);
\n
"
;
compute
<<
"velm[index] = convert
_mixed4(
velocity);
\n
"
;
else
{
for
(
int
i
=
0
;
i
<
(
int
)
perDofValues
->
getBuffers
().
size
();
i
++
)
{
const
OpenCLNonbondedUtilities
::
ParameterInfo
&
buffer
=
perDofValues
->
getBuffers
()[
i
];
...
...
@@ -4694,6 +4911,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
requiredUniform
[
step
]
=
numUniform
;
int
index
=
0
;
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getPosq
().
getDeviceBuffer
());
setPosqCorrectionArg
(
cl
,
kernel
,
index
++
);
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
integration
.
getPosDelta
().
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getVelm
().
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getForce
().
getDeviceBuffer
());
...
...
@@ -4711,7 +4929,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
// Create a second kernel for this step that sums the values.
program
=
cl
.
createProgram
(
OpenCLKernelSources
::
customIntegrator
,
defines
);
kernel
=
cl
::
Kernel
(
program
,
"compute
Sum"
);
kernel
=
cl
::
Kernel
(
program
,
useDouble
?
"computeDoubleSum"
:
"computeFloat
Sum"
);
kernels
[
step
].
push_back
(
kernel
);
index
=
0
;
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
sumBuffer
->
getDeviceBuffer
());
...
...
@@ -4760,6 +4978,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
kernels
[
step
].
push_back
(
kernel
);
int
index
=
0
;
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getPosq
().
getDeviceBuffer
());
setPosqCorrectionArg
(
cl
,
kernel
,
index
++
);
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
integration
.
getPosDelta
().
getDeviceBuffer
());
}
}
...
...
@@ -4767,7 +4986,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
// Create the kernel for summing energy.
cl
::
Program
program
=
cl
.
createProgram
(
OpenCLKernelSources
::
customIntegrator
,
defines
);
sumEnergyKernel
=
cl
::
Kernel
(
program
,
"compute
Sum"
);
sumEnergyKernel
=
cl
::
Kernel
(
program
,
cl
.
getUseDoublePrecision
()
?
"computeDoubleSum"
:
"computeFloat
Sum"
);
int
index
=
0
;
sumEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
cl
.
getEnergyBuffer
().
getDeviceBuffer
());
sumEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
energy
->
getDeviceBuffer
());
...
...
@@ -4778,26 +4997,48 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
// Make sure all values (variables, parameters, etc.) stored on the device are up to date.
if
(
!
deviceValuesAreCurrent
)
{
perDofValues
->
setParameterValues
(
localPerDofValues
);
if
(
useDouble
)
perDofValues
->
setParameterValues
(
localPerDofValuesDouble
);
else
perDofValues
->
setParameterValues
(
localPerDofValuesFloat
);
deviceValuesAreCurrent
=
true
;
}
localValuesAreCurrent
=
false
;
double
stepSize
=
integrator
.
getStepSize
();
if
(
stepSize
!=
prevStepSize
)
{
mm_float2
ss
=
mm_float2
(
0
,
(
float
)
stepSize
);
integration
.
getStepSize
().
upload
(
&
ss
);
if
(
useDouble
)
{
mm_double2
ss
=
mm_double2
(
0
,
stepSize
);
integration
.
getStepSize
().
upload
(
&
ss
);
}
else
{
mm_float2
ss
=
mm_float2
(
0
,
(
float
)
stepSize
);
integration
.
getStepSize
().
upload
(
&
ss
);
}
prevStepSize
=
stepSize
;
}
bool
paramsChanged
=
false
;
for
(
int
i
=
0
;
i
<
(
int
)
parameterNames
.
size
();
i
++
)
{
float
value
=
(
float
)
context
.
getParameter
(
parameterNames
[
i
]);
if
(
value
!=
contextValues
[
i
])
{
contextValues
[
i
]
=
value
;
paramsChanged
=
true
;
if
(
useDouble
)
{
for
(
int
i
=
0
;
i
<
(
int
)
parameterNames
.
size
();
i
++
)
{
double
value
=
context
.
getParameter
(
parameterNames
[
i
]);
if
(
value
!=
contextValuesDouble
[
i
])
{
contextValuesDouble
[
i
]
=
value
;
paramsChanged
=
true
;
}
}
if
(
paramsChanged
)
contextParameterValues
->
upload
(
contextValuesDouble
);
}
else
{
for
(
int
i
=
0
;
i
<
(
int
)
parameterNames
.
size
();
i
++
)
{
float
value
=
(
float
)
context
.
getParameter
(
parameterNames
[
i
]);
if
(
value
!=
contextValuesFloat
[
i
])
{
contextValuesFloat
[
i
]
=
value
;
paramsChanged
=
true
;
}
}
if
(
paramsChanged
)
contextParameterValues
->
upload
(
contextValuesFloat
);
}
if
(
paramsChanged
)
contextParameterValues
->
upload
(
contextValues
);
// Loop over computation steps in the integrator and execute them.
...
...
@@ -4826,7 +5067,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
forcesAreValid
=
true
;
}
if
(
stepType
[
i
]
==
CustomIntegrator
::
ComputePerDof
&&
!
merged
[
i
])
{
kernels
[
i
][
0
].
setArg
<
cl_uint
>
(
9
,
integration
.
prepareRandomNumbers
(
requiredGaussian
[
i
]));
kernels
[
i
][
0
].
setArg
<
cl_uint
>
(
10
,
integration
.
prepareRandomNumbers
(
requiredGaussian
[
i
]));
if
(
requiredUniform
[
i
]
>
0
)
cl
.
executeKernel
(
randomKernel
,
numAtoms
);
cl
.
executeKernel
(
kernels
[
i
][
0
],
numAtoms
);
...
...
@@ -4837,7 +5078,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
cl
.
executeKernel
(
kernels
[
i
][
0
],
1
,
1
);
}
else
if
(
stepType
[
i
]
==
CustomIntegrator
::
ComputeSum
)
{
kernels
[
i
][
0
].
setArg
<
cl_uint
>
(
9
,
integration
.
prepareRandomNumbers
(
requiredGaussian
[
i
]));
kernels
[
i
][
0
].
setArg
<
cl_uint
>
(
10
,
integration
.
prepareRandomNumbers
(
requiredGaussian
[
i
]));
if
(
requiredUniform
[
i
]
>
0
)
cl
.
executeKernel
(
randomKernel
,
numAtoms
);
cl
.
executeKernel
(
kernels
[
i
][
0
],
numAtoms
);
...
...
@@ -4875,11 +5116,21 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
void
OpenCLIntegrateCustomStepKernel
::
recordChangedParameters
(
ContextImpl
&
context
)
{
if
(
!
modifiesParameters
)
return
;
contextParameterValues
->
download
(
contextValues
);
for
(
int
i
=
0
;
i
<
(
int
)
parameterNames
.
size
();
i
++
)
{
float
value
=
(
float
)
context
.
getParameter
(
parameterNames
[
i
]);
if
(
value
!=
contextValues
[
i
])
context
.
setParameter
(
parameterNames
[
i
],
contextValues
[
i
]);
if
(
cl
.
getUseDoublePrecision
()
||
cl
.
getUseMixedPrecision
())
{
contextParameterValues
->
download
(
contextValuesDouble
);
for
(
int
i
=
0
;
i
<
(
int
)
parameterNames
.
size
();
i
++
)
{
double
value
=
context
.
getParameter
(
parameterNames
[
i
]);
if
(
value
!=
contextValuesDouble
[
i
])
context
.
setParameter
(
parameterNames
[
i
],
contextValuesDouble
[
i
]);
}
}
else
{
contextParameterValues
->
download
(
contextValuesFloat
);
for
(
int
i
=
0
;
i
<
(
int
)
parameterNames
.
size
();
i
++
)
{
float
value
=
(
float
)
context
.
getParameter
(
parameterNames
[
i
]);
if
(
value
!=
contextValuesFloat
[
i
])
context
.
setParameter
(
parameterNames
[
i
],
contextValuesFloat
[
i
]);
}
}
}
...
...
@@ -4888,43 +5139,72 @@ void OpenCLIntegrateCustomStepKernel::getGlobalVariables(ContextImpl& context, v
values
.
resize
(
0
);
return
;
}
vector
<
cl_float
>
buffer
;
globalValues
->
download
(
buffer
);
values
.
resize
(
numGlobalVariables
);
for
(
int
i
=
0
;
i
<
numGlobalVariables
;
i
++
)
values
[
i
]
=
buffer
[
i
];
if
(
cl
.
getUseDoublePrecision
()
||
cl
.
getUseMixedPrecision
())
globalValues
->
download
(
values
);
else
{
vector
<
cl_float
>
buffer
;
globalValues
->
download
(
buffer
);
for
(
int
i
=
0
;
i
<
numGlobalVariables
;
i
++
)
values
[
i
]
=
buffer
[
i
];
}
}
void
OpenCLIntegrateCustomStepKernel
::
setGlobalVariables
(
ContextImpl
&
context
,
const
vector
<
double
>&
values
)
{
if
(
numGlobalVariables
==
0
)
return
;
vector
<
cl_float
>
valuesVec
(
numGlobalVariables
);
for
(
int
i
=
0
;
i
<
numGlobalVariables
;
i
++
)
valuesVec
[
i
]
=
(
float
)
values
[
i
];
globalValues
->
upload
(
valuesVec
);
if
(
cl
.
getUseDoublePrecision
()
||
cl
.
getUseMixedPrecision
())
globalValues
->
upload
(
values
);
else
{
vector
<
cl_float
>
buffer
(
numGlobalVariables
);
for
(
int
i
=
0
;
i
<
numGlobalVariables
;
i
++
)
buffer
[
i
]
=
(
cl_float
)
values
[
i
];
globalValues
->
upload
(
buffer
);
}
}
void
OpenCLIntegrateCustomStepKernel
::
getPerDofVariable
(
ContextImpl
&
context
,
int
variable
,
vector
<
Vec3
>&
values
)
const
{
if
(
!
localValuesAreCurrent
)
{
perDofValues
->
getParameterValues
(
localPerDofValues
);
localValuesAreCurrent
=
true
;
}
values
.
resize
(
perDofValues
->
getNumObjects
()
/
3
);
const
vector
<
cl_int
>&
order
=
cl
.
getAtomIndex
();
for
(
int
i
=
0
;
i
<
(
int
)
values
.
size
();
i
++
)
for
(
int
j
=
0
;
j
<
3
;
j
++
)
values
[
order
[
i
]][
j
]
=
localPerDofValues
[
3
*
i
+
j
][
variable
];
const
vector
<
int
>&
order
=
cl
.
getAtomIndex
();
if
(
cl
.
getUseDoublePrecision
()
||
cl
.
getUseMixedPrecision
())
{
if
(
!
localValuesAreCurrent
)
{
perDofValues
->
getParameterValues
(
localPerDofValuesDouble
);
localValuesAreCurrent
=
true
;
}
for
(
int
i
=
0
;
i
<
(
int
)
values
.
size
();
i
++
)
for
(
int
j
=
0
;
j
<
3
;
j
++
)
values
[
order
[
i
]][
j
]
=
localPerDofValuesDouble
[
3
*
i
+
j
][
variable
];
}
else
{
if
(
!
localValuesAreCurrent
)
{
perDofValues
->
getParameterValues
(
localPerDofValuesFloat
);
localValuesAreCurrent
=
true
;
}
for
(
int
i
=
0
;
i
<
(
int
)
values
.
size
();
i
++
)
for
(
int
j
=
0
;
j
<
3
;
j
++
)
values
[
order
[
i
]][
j
]
=
localPerDofValuesFloat
[
3
*
i
+
j
][
variable
];
}
}
void
OpenCLIntegrateCustomStepKernel
::
setPerDofVariable
(
ContextImpl
&
context
,
int
variable
,
const
vector
<
Vec3
>&
values
)
{
if
(
!
localValuesAreCurrent
)
{
perDofValues
->
getParameterValues
(
localPerDofValues
);
localValuesAreCurrent
=
true
;
const
vector
<
int
>&
order
=
cl
.
getAtomIndex
();
if
(
cl
.
getUseDoublePrecision
()
||
cl
.
getUseMixedPrecision
())
{
if
(
!
localValuesAreCurrent
)
{
perDofValues
->
getParameterValues
(
localPerDofValuesDouble
);
localValuesAreCurrent
=
true
;
}
for
(
int
i
=
0
;
i
<
(
int
)
values
.
size
();
i
++
)
for
(
int
j
=
0
;
j
<
3
;
j
++
)
localPerDofValuesDouble
[
3
*
i
+
j
][
variable
]
=
values
[
order
[
i
]][
j
];
}
else
{
if
(
!
localValuesAreCurrent
)
{
perDofValues
->
getParameterValues
(
localPerDofValuesFloat
);
localValuesAreCurrent
=
true
;
}
for
(
int
i
=
0
;
i
<
(
int
)
values
.
size
();
i
++
)
for
(
int
j
=
0
;
j
<
3
;
j
++
)
localPerDofValuesFloat
[
3
*
i
+
j
][
variable
]
=
(
float
)
values
[
order
[
i
]][
j
];
}
const
vector
<
cl_int
>&
order
=
cl
.
getAtomIndex
();
for
(
int
i
=
0
;
i
<
(
int
)
values
.
size
();
i
++
)
for
(
int
j
=
0
;
j
<
3
;
j
++
)
localPerDofValues
[
3
*
i
+
j
][
variable
]
=
(
float
)
values
[
order
[
i
]][
j
];
deviceValuesAreCurrent
=
false
;
}
...
...
@@ -5035,13 +5315,23 @@ double OpenCLCalcKineticEnergyKernel::execute(ContextImpl& context) {
// We don't currently have a GPU kernel to do this, so we retrieve the velocities and calculate the energy
// on the CPU.
mm_float4
*
velm
=
(
mm_float4
*
)
cl
.
getPinnedBuffer
();
cl
.
getVelm
().
download
(
velm
);
double
energy
=
0.0
;
const
vector
<
cl_int
>&
order
=
cl
.
getAtomIndex
();
for
(
size_t
i
=
0
;
i
<
masses
.
size
();
++
i
)
{
mm_float4
v
=
velm
[
i
];
energy
+=
masses
[
order
[
i
]]
*
(
v
.
x
*
v
.
x
+
v
.
y
*
v
.
y
+
v
.
z
*
v
.
z
);
double
energy
=
0.0
;
if
(
cl
.
getUseDoublePrecision
()
||
cl
.
getUseMixedPrecision
())
{
mm_double4
*
velm
=
(
mm_double4
*
)
cl
.
getPinnedBuffer
();
cl
.
getVelm
().
download
(
velm
);
for
(
size_t
i
=
0
;
i
<
masses
.
size
();
++
i
)
{
mm_double4
v
=
velm
[
i
];
energy
+=
masses
[
order
[
i
]]
*
(
v
.
x
*
v
.
x
+
v
.
y
*
v
.
y
+
v
.
z
*
v
.
z
);
}
}
else
{
mm_float4
*
velm
=
(
mm_float4
*
)
cl
.
getPinnedBuffer
();
cl
.
getVelm
().
download
(
velm
);
for
(
size_t
i
=
0
;
i
<
masses
.
size
();
++
i
)
{
mm_float4
v
=
velm
[
i
];
energy
+=
masses
[
order
[
i
]]
*
(
v
.
x
*
v
.
x
+
v
.
y
*
v
.
y
+
v
.
z
*
v
.
z
);
}
}
return
0.5
*
energy
;
}
...
...
platforms/opencl/src/OpenCLKernels.h
View file @
8d6a2a01
...
...
@@ -1145,7 +1145,10 @@ private:
OpenCLArray
*
uniformRandoms
;
OpenCLArray
*
randomSeed
;
OpenCLParameterSet
*
perDofValues
;
mutable
std
::
vector
<
std
::
vector
<
cl_float
>
>
localPerDofValues
;
mutable
std
::
vector
<
std
::
vector
<
cl_float
>
>
localPerDofValuesFloat
;
mutable
std
::
vector
<
std
::
vector
<
cl_double
>
>
localPerDofValuesDouble
;
std
::
vector
<
float
>
contextValuesFloat
;
std
::
vector
<
double
>
contextValuesDouble
;
std
::
vector
<
float
>
contextValues
;
std
::
vector
<
std
::
vector
<
cl
::
Kernel
>
>
kernels
;
cl
::
Kernel
sumEnergyKernel
,
randomKernel
;
...
...
platforms/opencl/src/OpenCLParameterSet.cpp
View file @
8d6a2a01
...
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors.
*
* Portions copyright (c) 2009
-2012
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -32,32 +32,34 @@
using
namespace
OpenMM
;
using
namespace
std
;
OpenCLParameterSet
::
OpenCLParameterSet
(
OpenCLContext
&
context
,
int
numParameters
,
int
numObjects
,
const
string
&
name
,
bool
bufferPerParameter
)
:
OpenCLParameterSet
::
OpenCLParameterSet
(
OpenCLContext
&
context
,
int
numParameters
,
int
numObjects
,
const
string
&
name
,
bool
bufferPerParameter
,
bool
useDoublePrecision
)
:
context
(
context
),
numParameters
(
numParameters
),
numObjects
(
numObjects
),
name
(
name
)
{
int
params
=
numParameters
;
int
bufferCount
=
0
;
elementSize
=
(
useDoublePrecision
?
sizeof
(
double
)
:
sizeof
(
float
));
string
elementType
=
(
useDoublePrecision
?
"double"
:
"float"
);
try
{
if
(
!
bufferPerParameter
)
{
while
(
params
>
2
)
{
cl
::
Buffer
*
buf
=
new
cl
::
Buffer
(
context
.
getContext
(),
CL_MEM_READ_WRITE
,
numObjects
*
sizeof
(
mm_float4
)
);
cl
::
Buffer
*
buf
=
new
cl
::
Buffer
(
context
.
getContext
(),
CL_MEM_READ_WRITE
,
numObjects
*
elementSize
*
4
);
std
::
stringstream
name
;
name
<<
"param"
<<
(
++
bufferCount
);
buffers
.
push_back
(
OpenCLNonbondedUtilities
::
ParameterInfo
(
name
.
str
(),
"float"
,
4
,
sizeof
(
mm_float4
)
,
*
buf
));
buffers
.
push_back
(
OpenCLNonbondedUtilities
::
ParameterInfo
(
name
.
str
(),
elementType
,
4
,
elementSize
*
4
,
*
buf
));
params
-=
4
;
}
if
(
params
>
1
)
{
cl
::
Buffer
*
buf
=
new
cl
::
Buffer
(
context
.
getContext
(),
CL_MEM_READ_WRITE
,
numObjects
*
sizeof
(
mm_float2
)
);
cl
::
Buffer
*
buf
=
new
cl
::
Buffer
(
context
.
getContext
(),
CL_MEM_READ_WRITE
,
numObjects
*
elementSize
*
2
);
std
::
stringstream
name
;
name
<<
"param"
<<
(
++
bufferCount
);
buffers
.
push_back
(
OpenCLNonbondedUtilities
::
ParameterInfo
(
name
.
str
(),
"float"
,
2
,
sizeof
(
mm_float2
)
,
*
buf
));
buffers
.
push_back
(
OpenCLNonbondedUtilities
::
ParameterInfo
(
name
.
str
(),
elementType
,
2
,
elementSize
*
2
,
*
buf
));
params
-=
2
;
}
}
while
(
params
>
0
)
{
cl
::
Buffer
*
buf
=
new
cl
::
Buffer
(
context
.
getContext
(),
CL_MEM_READ_WRITE
,
numObjects
*
sizeof
(
cl_float
)
);
cl
::
Buffer
*
buf
=
new
cl
::
Buffer
(
context
.
getContext
(),
CL_MEM_READ_WRITE
,
numObjects
*
elementSize
);
std
::
stringstream
name
;
name
<<
"param"
<<
(
++
bufferCount
);
buffers
.
push_back
(
OpenCLNonbondedUtilities
::
ParameterInfo
(
name
.
str
(),
"float"
,
1
,
sizeof
(
cl_float
)
,
*
buf
));
buffers
.
push_back
(
OpenCLNonbondedUtilities
::
ParameterInfo
(
name
.
str
(),
elementType
,
1
,
elementSize
,
*
buf
));
params
--
;
}
}
...
...
@@ -73,39 +75,42 @@ OpenCLParameterSet::~OpenCLParameterSet() {
delete
&
buffers
[
i
].
getMemory
();
}
void
OpenCLParameterSet
::
getParameterValues
(
vector
<
vector
<
cl_float
>
>&
values
)
const
{
template
<
class
T
>
void
OpenCLParameterSet
::
getParameterValues
(
vector
<
vector
<
T
>
>&
values
)
const
{
if
(
sizeof
(
T
)
!=
elementSize
)
throw
OpenMMException
(
"Called getParameterValues() with vector of wrong type"
);
values
.
resize
(
numObjects
);
for
(
int
i
=
0
;
i
<
numObjects
;
i
++
)
values
[
i
].
resize
(
numParameters
);
try
{
int
base
=
0
;
for
(
int
i
=
0
;
i
<
(
int
)
buffers
.
size
();
i
++
)
{
if
(
buffers
[
i
].
get
Typ
e
()
==
"float4"
)
{
vector
<
mm_float4
>
data
(
numObjects
);
if
(
buffers
[
i
].
get
Siz
e
()
==
4
*
elementSize
)
{
vector
<
T
>
data
(
4
*
numObjects
);
context
.
getQueue
().
enqueueReadBuffer
(
reinterpret_cast
<
cl
::
Buffer
&>
(
buffers
[
i
].
getMemory
()),
CL_TRUE
,
0
,
numObjects
*
buffers
[
i
].
getSize
(),
&
data
[
0
]);
for
(
int
j
=
0
;
j
<
numObjects
;
j
++
)
{
values
[
j
][
base
]
=
data
[
j
]
.
x
;
values
[
j
][
base
]
=
data
[
4
*
j
];
if
(
base
+
1
<
numParameters
)
values
[
j
][
base
+
1
]
=
data
[
j
].
y
;
values
[
j
][
base
+
1
]
=
data
[
4
*
j
+
1
]
;
if
(
base
+
2
<
numParameters
)
values
[
j
][
base
+
2
]
=
data
[
j
].
z
;
values
[
j
][
base
+
2
]
=
data
[
4
*
j
+
2
]
;
if
(
base
+
3
<
numParameters
)
values
[
j
][
base
+
3
]
=
data
[
j
].
w
;
values
[
j
][
base
+
3
]
=
data
[
4
*
j
+
3
]
;
}
base
+=
4
;
}
else
if
(
buffers
[
i
].
get
Typ
e
()
==
"float2"
)
{
vector
<
mm_float2
>
data
(
numObjects
);
else
if
(
buffers
[
i
].
get
Siz
e
()
==
2
*
elementSize
)
{
vector
<
T
>
data
(
2
*
numObjects
);
context
.
getQueue
().
enqueueReadBuffer
(
reinterpret_cast
<
cl
::
Buffer
&>
(
buffers
[
i
].
getMemory
()),
CL_TRUE
,
0
,
numObjects
*
buffers
[
i
].
getSize
(),
&
data
[
0
]);
for
(
int
j
=
0
;
j
<
numObjects
;
j
++
)
{
values
[
j
][
base
]
=
data
[
j
]
.
x
;
values
[
j
][
base
]
=
data
[
2
*
j
];
if
(
base
+
1
<
numParameters
)
values
[
j
][
base
+
1
]
=
data
[
j
].
y
;
values
[
j
][
base
+
1
]
=
data
[
2
*
j
+
1
]
;
}
base
+=
2
;
}
else
if
(
buffers
[
i
].
get
Typ
e
()
==
"float"
)
{
vector
<
cl_float
>
data
(
numObjects
);
else
if
(
buffers
[
i
].
get
Siz
e
()
==
elementSize
)
{
vector
<
T
>
data
(
numObjects
);
context
.
getQueue
().
enqueueReadBuffer
(
reinterpret_cast
<
cl
::
Buffer
&>
(
buffers
[
i
].
getMemory
()),
CL_TRUE
,
0
,
numObjects
*
buffers
[
i
].
getSize
(),
&
data
[
0
]);
for
(
int
j
=
0
;
j
<
numObjects
;
j
++
)
values
[
j
][
base
]
=
data
[
j
];
...
...
@@ -122,36 +127,39 @@ void OpenCLParameterSet::getParameterValues(vector<vector<cl_float> >& values) c
}
}
void
OpenCLParameterSet
::
setParameterValues
(
const
vector
<
vector
<
cl_float
>
>&
values
)
{
template
<
class
T
>
void
OpenCLParameterSet
::
setParameterValues
(
const
vector
<
vector
<
T
>
>&
values
)
{
if
(
sizeof
(
T
)
!=
elementSize
)
throw
OpenMMException
(
"Called setParameterValues() with vector of wrong type"
);
try
{
int
base
=
0
;
for
(
int
i
=
0
;
i
<
(
int
)
buffers
.
size
();
i
++
)
{
if
(
buffers
[
i
].
get
Typ
e
()
==
"float4"
)
{
vector
<
mm_float4
>
data
(
numObjects
);
if
(
buffers
[
i
].
get
Siz
e
()
==
4
*
elementSize
)
{
vector
<
T
>
data
(
4
*
numObjects
);
for
(
int
j
=
0
;
j
<
numObjects
;
j
++
)
{
data
[
j
]
.
x
=
values
[
j
][
base
];
data
[
4
*
j
]
=
values
[
j
][
base
];
if
(
base
+
1
<
numParameters
)
data
[
j
].
y
=
values
[
j
][
base
+
1
];
data
[
4
*
j
+
1
]
=
values
[
j
][
base
+
1
];
if
(
base
+
2
<
numParameters
)
data
[
j
].
z
=
values
[
j
][
base
+
2
];
data
[
4
*
j
+
2
]
=
values
[
j
][
base
+
2
];
if
(
base
+
3
<
numParameters
)
data
[
j
].
w
=
values
[
j
][
base
+
3
];
data
[
4
*
j
+
3
]
=
values
[
j
][
base
+
3
];
}
context
.
getQueue
().
enqueueWriteBuffer
(
reinterpret_cast
<
cl
::
Buffer
&>
(
buffers
[
i
].
getMemory
()),
CL_TRUE
,
0
,
numObjects
*
buffers
[
i
].
getSize
(),
&
data
[
0
]);
base
+=
4
;
}
else
if
(
buffers
[
i
].
get
Typ
e
()
==
"float2"
)
{
vector
<
mm_float2
>
data
(
numObjects
);
else
if
(
buffers
[
i
].
get
Siz
e
()
==
2
*
elementSize
)
{
vector
<
T
>
data
(
2
*
numObjects
);
for
(
int
j
=
0
;
j
<
numObjects
;
j
++
)
{
data
[
j
]
.
x
=
values
[
j
][
base
];
data
[
2
*
j
]
=
values
[
j
][
base
];
if
(
base
+
1
<
numParameters
)
data
[
j
].
y
=
values
[
j
][
base
+
1
];
data
[
2
*
j
+
1
]
=
values
[
j
][
base
+
1
];
}
context
.
getQueue
().
enqueueWriteBuffer
(
reinterpret_cast
<
cl
::
Buffer
&>
(
buffers
[
i
].
getMemory
()),
CL_TRUE
,
0
,
numObjects
*
buffers
[
i
].
getSize
(),
&
data
[
0
]);
base
+=
2
;
}
else
if
(
buffers
[
i
].
get
Typ
e
()
==
"float"
)
{
vector
<
cl_float
>
data
(
numObjects
);
else
if
(
buffers
[
i
].
get
Siz
e
()
==
elementSize
)
{
vector
<
T
>
data
(
numObjects
);
for
(
int
j
=
0
;
j
<
numObjects
;
j
++
)
data
[
j
]
=
values
[
j
][
base
];
context
.
getQueue
().
enqueueWriteBuffer
(
reinterpret_cast
<
cl
::
Buffer
&>
(
buffers
[
i
].
getMemory
()),
CL_TRUE
,
0
,
numObjects
*
buffers
[
i
].
getSize
(),
&
data
[
0
]);
...
...
@@ -172,16 +180,26 @@ string OpenCLParameterSet::getParameterSuffix(int index, const std::string& extr
const
string
suffixes
[]
=
{
".x"
,
".y"
,
".z"
,
".w"
};
int
buffer
=
-
1
;
for
(
int
i
=
0
;
buffer
==
-
1
&&
i
<
(
int
)
buffers
.
size
();
i
++
)
{
if
(
index
*
sizeof
(
cl_float
)
<
buffers
[
i
].
getSize
())
if
(
index
*
elementSize
<
buffers
[
i
].
getSize
())
buffer
=
i
;
else
index
-=
buffers
[
i
].
getSize
()
/
sizeof
(
cl_float
)
;
index
-=
buffers
[
i
].
getSize
()
/
elementSize
;
}
if
(
buffer
==
-
1
)
throw
OpenMMException
(
"Internal error: Illegal argument to OpenCLParameterSet::getParameterSuffix() ("
+
name
+
")"
);
stringstream
suffix
;
suffix
<<
(
buffer
+
1
)
<<
extraSuffix
;
if
(
buffers
[
buffer
].
get
Typ
e
()
!=
"float"
)
if
(
buffers
[
buffer
].
get
Siz
e
()
!=
elementSize
)
suffix
<<
suffixes
[
index
];
return
suffix
.
str
();
}
/**
* Define template instantiations for float and double versions of getParameterValues() and setParameterValues().
*/
namespace
OpenMM
{
template
void
OpenCLParameterSet
::
getParameterValues
<
float
>(
vector
<
vector
<
float
>
>&
values
)
const
;
template
void
OpenCLParameterSet
::
setParameterValues
<
float
>(
const
vector
<
vector
<
float
>
>&
values
);
template
void
OpenCLParameterSet
::
getParameterValues
<
double
>(
vector
<
vector
<
double
>
>&
values
)
const
;
template
void
OpenCLParameterSet
::
setParameterValues
<
double
>(
const
vector
<
vector
<
double
>
>&
values
);
}
\ No newline at end of file
platforms/opencl/src/OpenCLParameterSet.h
View file @
8d6a2a01
...
...
@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors.
*
* Portions copyright (c) 2009
-2012
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -51,8 +51,9 @@ public:
* @param name the name of the parameter set
* @param bufferPerParameter if true, a separate cl::Buffer is created for each parameter. If false,
* multiple parameters may be combined into a single buffer.
* @param useDoublePrecision whether values should be stored as single or double precision
*/
OpenCLParameterSet
(
OpenCLContext
&
context
,
int
numParameters
,
int
numObjects
,
const
std
::
string
&
name
,
bool
bufferPerParameter
=
false
);
OpenCLParameterSet
(
OpenCLContext
&
context
,
int
numParameters
,
int
numObjects
,
const
std
::
string
&
name
,
bool
bufferPerParameter
=
false
,
bool
useDoublePrecision
=
false
);
~
OpenCLParameterSet
();
/**
* Get the number of parameters.
...
...
@@ -71,13 +72,15 @@ public:
*
* @param values on exit, values[i][j] contains the value of parameter j for object i
*/
void
getParameterValues
(
std
::
vector
<
std
::
vector
<
cl_float
>
>&
values
)
const
;
template
<
class
T
>
void
getParameterValues
(
std
::
vector
<
std
::
vector
<
T
>
>&
values
)
const
;
/**
* Set the values of all parameters.
*
* @param values values[i][j] contains the value of parameter j for object i
*/
void
setParameterValues
(
const
std
::
vector
<
std
::
vector
<
cl_float
>
>&
values
);
template
<
class
T
>
void
setParameterValues
(
const
std
::
vector
<
std
::
vector
<
T
>
>&
values
);
/**
* Get a set of OpenCLNonbondedUtilities::ParameterInfo objects which describe the Buffers
* containing the data.
...
...
@@ -95,8 +98,7 @@ public:
std
::
string
getParameterSuffix
(
int
index
,
const
std
::
string
&
extraSuffix
=
""
)
const
;
private:
OpenCLContext
&
context
;
int
numParameters
;
int
numObjects
;
int
numParameters
,
numObjects
,
elementSize
;
std
::
string
name
;
std
::
vector
<
OpenCLNonbondedUtilities
::
ParameterInfo
>
buffers
;
};
...
...
platforms/opencl/src/OpenCLPlatform.cpp
View file @
8d6a2a01
...
...
@@ -76,8 +76,10 @@ OpenCLPlatform::OpenCLPlatform() {
registerKernelFactory
(
RemoveCMMotionKernel
::
Name
(),
factory
);
platformProperties
.
push_back
(
OpenCLDeviceIndex
());
platformProperties
.
push_back
(
OpenCLPlatformIndex
());
platformProperties
.
push_back
(
OpenCLPrecision
());
setPropertyDefaultValue
(
OpenCLDeviceIndex
(),
""
);
setPropertyDefaultValue
(
OpenCLPlatformIndex
(),
""
);
setPropertyDefaultValue
(
OpenCLPrecision
(),
"single"
);
}
bool
OpenCLPlatform
::
supportsDoublePrecision
()
const
{
...
...
@@ -101,7 +103,9 @@ void OpenCLPlatform::contextCreated(ContextImpl& context, const map<string, stri
getPropertyDefaultValue
(
OpenCLPlatformIndex
())
:
properties
.
find
(
OpenCLPlatformIndex
())
->
second
);
const
string
&
devicePropValue
=
(
properties
.
find
(
OpenCLDeviceIndex
())
==
properties
.
end
()
?
getPropertyDefaultValue
(
OpenCLDeviceIndex
())
:
properties
.
find
(
OpenCLDeviceIndex
())
->
second
);
context
.
setPlatformData
(
new
PlatformData
(
context
.
getSystem
(),
platformPropValue
,
devicePropValue
));
string
precisionPropValue
=
(
properties
.
find
(
OpenCLPrecision
())
==
properties
.
end
()
?
getPropertyDefaultValue
(
OpenCLPrecision
())
:
properties
.
find
(
OpenCLPrecision
())
->
second
);
context
.
setPlatformData
(
new
PlatformData
(
context
.
getSystem
(),
platformPropValue
,
devicePropValue
,
precisionPropValue
));
}
void
OpenCLPlatform
::
contextDestroyed
(
ContextImpl
&
context
)
const
{
...
...
@@ -109,7 +113,8 @@ void OpenCLPlatform::contextDestroyed(ContextImpl& context) const {
delete
data
;
}
OpenCLPlatform
::
PlatformData
::
PlatformData
(
const
System
&
system
,
const
string
&
platformPropValue
,
const
string
&
deviceIndexProperty
)
:
removeCM
(
false
),
stepCount
(
0
),
computeForceCount
(
0
),
time
(
0.0
)
{
OpenCLPlatform
::
PlatformData
::
PlatformData
(
const
System
&
system
,
const
string
&
platformPropValue
,
const
string
&
deviceIndexProperty
,
const
string
&
precisionProperty
)
:
removeCM
(
false
),
stepCount
(
0
),
computeForceCount
(
0
),
time
(
0.0
)
{
int
platformIndex
=
0
;
if
(
platformPropValue
.
length
()
>
0
)
stringstream
(
platformPropValue
)
>>
platformIndex
;
...
...
@@ -124,11 +129,11 @@ OpenCLPlatform::PlatformData::PlatformData(const System& system, const string& p
if
(
devices
[
i
].
length
()
>
0
)
{
unsigned
int
deviceIndex
;
stringstream
(
devices
[
i
])
>>
deviceIndex
;
contexts
.
push_back
(
new
OpenCLContext
(
system
,
platformIndex
,
deviceIndex
,
*
this
));
contexts
.
push_back
(
new
OpenCLContext
(
system
,
platformIndex
,
deviceIndex
,
precisionProperty
,
*
this
));
}
}
if
(
contexts
.
size
()
==
0
)
contexts
.
push_back
(
new
OpenCLContext
(
system
,
platformIndex
,
-
1
,
*
this
));
contexts
.
push_back
(
new
OpenCLContext
(
system
,
platformIndex
,
-
1
,
precisionProperty
,
*
this
));
stringstream
device
;
for
(
int
i
=
0
;
i
<
(
int
)
contexts
.
size
();
i
++
)
{
if
(
i
>
0
)
...
...
@@ -137,6 +142,7 @@ OpenCLPlatform::PlatformData::PlatformData(const System& system, const string& p
}
propertyValues
[
OpenCLPlatform
::
OpenCLDeviceIndex
()]
=
device
.
str
();
propertyValues
[
OpenCLPlatform
::
OpenCLPlatformIndex
()]
=
OpenCLExpressionUtilities
::
intToString
(
platformIndex
);
propertyValues
[
OpenCLPlatform
::
OpenCLPrecision
()]
=
precisionProperty
;
contextEnergy
.
resize
(
contexts
.
size
());
}
...
...
platforms/opencl/src/kernels/andersenThermostat.cl
View file @
8d6a2a01
...
...
@@ -2,17 +2,19 @@
*
Apply
the
Andersen
thermostat
to
adjust
particle
velocities.
*/
__kernel
void
applyAndersenThermostat
(
float
collisionFrequency,
float
kT,
__global
float
4*
velm,
__global
const
float
2*
restrict
stepSize,
__global
const
float4*
restrict
random,
__kernel
void
applyAndersenThermostat
(
float
collisionFrequency,
float
kT,
__global
mixed
4*
velm,
__global
const
mixed
2*
restrict
stepSize,
__global
const
float4*
restrict
random,
unsigned
int
randomIndex,
__global
const
int*
restrict
atomGroups
)
{
float
collisionProbability
=
1.0f-exp
(
-collisionFrequency*stepSize[0].y
)
;
float
randomRange
=
erf
(
collisionProbability/sqrt
(
2.0f
))
;
for
(
int
index
=
get_global_id
(
0
)
; index < NUM_ATOMS; index += get_global_size(0)) {
float
4
velocity
=
velm[index]
;
mixed
4
velocity
=
velm[index]
;
float4
selectRand
=
random[randomIndex+atomGroups[index]]
;
float4
velRand
=
random[randomIndex+index]
;
float
scale
=
(
selectRand.w
>
-randomRange
&&
selectRand.w
<
randomRange
?
0.0f
:
1.0f
)
;
float
add
=
(
1.0f-scale
)
*sqrt
(
kT*velocity.w
)
;
velocity.xyz
=
scale*velocity.xyz
+
add*velRand.xyz
;
real
scale
=
(
selectRand.w
>
-randomRange
&&
selectRand.w
<
randomRange
?
0
:
1
)
;
real
add
=
(
1-scale
)
*sqrt
(
kT*velocity.w
)
;
velocity.x
=
scale*velocity.x
+
add*velRand.x
;
velocity.y
=
scale*velocity.y
+
add*velRand.y
;
velocity.z
=
scale*velocity.z
+
add*velRand.z
;
velm[index]
=
velocity
;
}
}
platforms/opencl/src/kernels/brownian.cl
View file @
8d6a2a01
...
...
@@ -2,13 +2,16 @@
*
Perform
the
first
step
of
Brownian
integration.
*/
__kernel
void
integrateBrownianPart1
(
float
tauDeltaT,
float
noiseAmplitude,
__global
const
float
4*
restrict
force,
__global
float
4*
restrict
posDelta,
__global
const
float
4*
restrict
velm,
__global
const
float4*
restrict
random,
unsigned
int
randomIndex
)
{
__kernel
void
integrateBrownianPart1
(
mixed
tauDeltaT,
mixed
noiseAmplitude,
__global
const
real
4*
restrict
force,
__global
mixed
4*
restrict
posDelta,
__global
const
mixed
4*
restrict
velm,
__global
const
float4*
restrict
random,
unsigned
int
randomIndex
)
{
randomIndex
+=
get_global_id
(
0
)
;
for
(
int
index
=
get_global_id
(
0
)
; index < NUM_ATOMS; index += get_global_size(0)) {
float
invMass
=
velm[index].w
;
if
(
invMass
!=
0.0
)
posDelta[index]
=
(
float4
)
(
tauDeltaT*invMass*force[index].xyz
+
noiseAmplitude*sqrt
(
invMass
)
*random[randomIndex].xyz,
0.0f
)
;
mixed
invMass
=
velm[index].w
;
if
(
invMass
!=
0
)
{
posDelta[index]
=
(
mixed4
)
(
tauDeltaT*invMass*force[index].x
+
noiseAmplitude*sqrt
(
invMass
)
*random[randomIndex].x,
tauDeltaT*invMass*force[index].y
+
noiseAmplitude*sqrt
(
invMass
)
*random[randomIndex].y,
tauDeltaT*invMass*force[index].z
+
noiseAmplitude*sqrt
(
invMass
)
*random[randomIndex].z,
0
)
;
}
randomIndex
+=
get_global_size
(
0
)
;
}
}
...
...
@@ -17,12 +20,29 @@ __kernel void integrateBrownianPart1(float tauDeltaT, float noiseAmplitude, __gl
*
Perform
the
second
step
of
Brownian
integration.
*/
__kernel
void
integrateBrownianPart2
(
float
oneOverDeltaT,
__global
float
4*
posq,
__global
float
4*
velm,
__global
const
float
4*
restrict
posDelta
)
{
__kernel
void
integrateBrownianPart2
(
mixed
oneOverDeltaT,
__global
real
4*
posq,
__global
real4*
posqCorrection,
__global
mixed
4*
velm,
__global
const
mixed
4*
restrict
posDelta
)
{
for
(
int
index
=
get_global_id
(
0
)
; index < NUM_ATOMS; index += get_global_size(0)) {
if
(
velm[index].w
!=
0.0
)
{
float4
delta
=
posDelta[index]
;
velm[index].xyz
=
oneOverDeltaT*delta.xyz
;
posq[index].xyz
=
posq[index].xyz
+
delta.xyz
;
if
(
velm[index].w
!=
0
)
{
mixed4
delta
=
posDelta[index]
;
velm[index].x
=
oneOverDeltaT*delta.x
;
velm[index].y
=
oneOverDeltaT*delta.y
;
velm[index].z
=
oneOverDeltaT*delta.z
;
#
ifdef
USE_MIXED_PRECISION
real4
pos1
=
posq[index]
;
real4
pos2
=
posqCorrection[index]
;
mixed4
pos
=
(
mixed4
)
(
pos1.x+
(
mixed
)
pos2.x,
pos1.y+
(
mixed
)
pos2.y,
pos1.z+
(
mixed
)
pos2.z,
pos1.w
)
;
#
else
real4
pos
=
posq[index]
;
#
endif
pos.x
+=
delta.x
;
pos.y
+=
delta.y
;
pos.z
+=
delta.z
;
#
ifdef
USE_MIXED_PRECISION
posq[index]
=
(
real4
)
((
real
)
pos.x,
(
real
)
pos.y,
(
real
)
pos.z,
(
real
)
pos.w
)
;
posqCorrection[index]
=
(
real4
)
(
pos.x-
(
real
)
pos.x,
pos.y-
(
real
)
pos.y,
pos.z-
(
real
)
pos.z,
0
)
;
#
else
posq[index]
=
pos
;
#
endif
}
}
}
platforms/opencl/src/kernels/ccma.cl
View file @
8d6a2a01
mixed4
loadPos
(
__global
const
real4*
restrict
posq,
__global
const
real4*
restrict
posqCorrection,
int
index
)
{
#
ifdef
USE_MIXED_PRECISION
real4
pos1
=
posq[index]
;
real4
pos2
=
posqCorrection[index]
;
return
(
mixed4
)
(
pos1.x+
(
mixed
)
pos2.x,
pos1.y+
(
mixed
)
pos2.y,
pos1.z+
(
mixed
)
pos2.z,
pos1.w
)
;
#
else
return
posq[index]
;
#
endif
}
/**
*
Compute
the
direction
each
constraint
is
pointing
in.
This
is
called
once
at
the
beginning
of
constraint
evaluation.
*/
__kernel
void
computeConstraintDirections
(
__global
const
int2*
restrict
constraintAtoms,
__global
float
4*
restrict
constraintDistance,
__global
const
float
4*
restrict
atomPositions
)
{
__kernel
void
computeConstraintDirections
(
__global
const
int2*
restrict
constraintAtoms,
__global
mixed
4*
restrict
constraintDistance,
__global
const
real
4*
restrict
atomPositions
,
__global
const
real4*
restrict
posCorrection
)
{
for
(
int
index
=
get_global_id
(
0
)
; index < NUM_CONSTRAINTS; index += get_global_size(0)) {
//
Compute
the
direction
for
this
constraint.
int2
atoms
=
constraintAtoms[index]
;
float
4
dir
=
constraintDistance[index]
;
float
4
oldPos1
=
atomPositions
[
atoms.x
]
;
float
4
oldPos2
=
atomPositions
[
atoms.y
]
;
mixed
4
dir
=
constraintDistance[index]
;
mixed
4
oldPos1
=
loadPos
(
atomPositions
,
posCorrection,
atoms.x
)
;
mixed
4
oldPos2
=
loadPos
(
atomPositions
,
posCorrection,
atoms.y
)
;
dir.x
=
oldPos1.x-oldPos2.x
;
dir.y
=
oldPos1.y-oldPos2.y
;
dir.z
=
oldPos1.z-oldPos2.z
;
...
...
@@ -19,8 +28,8 @@ __kernel void computeConstraintDirections(__global const int2* restrict constrai
/**
*
Compute
the
force
applied
by
each
constraint.
*/
__kernel
void
computeConstraintForce
(
__global
const
int2*
restrict
constraintAtoms,
__global
const
float
4*
restrict
constraintDistance,
__global
const
float
4*
restrict
atomPositions,
__global
const
float
*
restrict
reducedMass,
__global
float
*
restrict
delta1,
__global
int*
restrict
converged,
float
tol,
int
iteration
)
{
__kernel
void
computeConstraintForce
(
__global
const
int2*
restrict
constraintAtoms,
__global
const
mixed
4*
restrict
constraintDistance,
__global
const
mixed
4*
restrict
atomPositions,
__global
const
mixed
*
restrict
reducedMass,
__global
mixed
*
restrict
delta1,
__global
int*
restrict
converged,
mixed
tol,
int
iteration
)
{
__local
int
groupConverged
;
if
(
converged[1-iteration%2]
)
{
if
(
get_global_id
(
0
)
==
0
)
...
...
@@ -30,21 +39,21 @@ __kernel void computeConstraintForce(__global const int2* restrict constraintAto
if
(
get_local_id
(
0
)
==
0
)
groupConverged
=
1
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
float
lowerTol
=
1
.0f-2.0f
*tol+tol*tol
;
float
upperTol
=
1
.0f+2.0f
*tol+tol*tol
;
mixed
lowerTol
=
1
-2
*tol+tol*tol
;
mixed
upperTol
=
1
+2
*tol+tol*tol
;
for
(
int
index
=
get_global_id
(
0
)
; index < NUM_CONSTRAINTS; index += get_global_size(0)) {
//
Compute
the
force
due
to
this
constraint.
int2
atoms
=
constraintAtoms[index]
;
float
4
dir
=
constraintDistance[index]
;
float
4
rp_ij
=
atomPositions[atoms.x]-atomPositions[atoms.y]
;
mixed
4
dir
=
constraintDistance[index]
;
mixed
4
rp_ij
=
atomPositions[atoms.x]-atomPositions[atoms.y]
;
#
ifndef
CONSTRAIN_VELOCITIES
rp_ij.xyz
+=
dir.xyz
;
#
endif
float
rrpr
=
rp_ij.x*dir.x
+
rp_ij.y*dir.y
+
rp_ij.z*dir.z
;
float
d_ij2
=
dir.x*dir.x
+
dir.y*dir.y
+
dir.z*dir.z
;
mixed
rrpr
=
rp_ij.x*dir.x
+
rp_ij.y*dir.y
+
rp_ij.z*dir.z
;
mixed
d_ij2
=
dir.x*dir.x
+
dir.y*dir.y
+
dir.z*dir.z
;
#
ifdef
CONSTRAIN_VELOCITIES
delta1[index]
=
-2
.0f
*reducedMass[index]*rrpr/d_ij2
;
delta1[index]
=
-2*reducedMass[index]*rrpr/d_ij2
;
//
See
whether
it
has
converged.
...
...
@@ -53,9 +62,9 @@ __kernel void computeConstraintForce(__global const int2* restrict constraintAto
converged[iteration%2]
=
0
;
}
#
else
float
rp2
=
rp_ij.x*rp_ij.x
+
rp_ij.y*rp_ij.y
+
rp_ij.z*rp_ij.z
;
float
dist2
=
dir.w*dir.w
;
float
diff
=
dist2
-
rp2
;
mixed
rp2
=
rp_ij.x*rp_ij.x
+
rp_ij.y*rp_ij.y
+
rp_ij.z*rp_ij.z
;
mixed
dist2
=
dir.w*dir.w
;
mixed
diff
=
dist2
-
rp2
;
delta1[index]
=
(
rrpr
>
d_ij2*1e-6f
?
reducedMass[index]*diff/rrpr
:
0.0f
)
;
//
See
whether
it
has
converged.
...
...
@@ -71,15 +80,15 @@ __kernel void computeConstraintForce(__global const int2* restrict constraintAto
/**
*
Multiply
the
vector
of
constraint
forces
by
the
constraint
matrix.
*/
__kernel
void
multiplyByConstraintMatrix
(
__global
const
float
*
restrict
delta1,
__global
float
*
restrict
delta2,
__global
const
int*
restrict
constraintMatrixColumn,
__global
const
float
*
restrict
constraintMatrixValue,
__global
const
int*
restrict
converged,
int
iteration
)
{
__kernel
void
multiplyByConstraintMatrix
(
__global
const
mixed
*
restrict
delta1,
__global
mixed
*
restrict
delta2,
__global
const
int*
restrict
constraintMatrixColumn,
__global
const
mixed
*
restrict
constraintMatrixValue,
__global
const
int*
restrict
converged,
int
iteration
)
{
if
(
converged[iteration%2]
)
return
; // The constraint iteration has already converged.
//
Multiply
by
the
inverse
constraint
matrix.
for
(
int
index
=
get_global_id
(
0
)
; index < NUM_CONSTRAINTS; index += get_global_size(0)) {
float
sum
=
0
.0f
;
mixed
sum
=
0
;
for
(
int
i
=
0
; ; i++) {
int
element
=
index+i*NUM_CONSTRAINTS
;
int
column
=
constraintMatrixColumn[element]
;
...
...
@@ -94,26 +103,26 @@ __kernel void multiplyByConstraintMatrix(__global const float* restrict delta1,
/**
*
Update
the
atom
positions
based
on
constraint
forces.
*/
__kernel
void
updateAtomPositions
(
__global
const
int*
restrict
numAtomConstraints,
__global
const
int*
restrict
atomConstraints,
__global
const
float
4*
restrict
constraintDistance,
__global
float
4*
restrict
atomPositions,
__global
const
float
4*
restrict
velm,
__global
const
float
*
restrict
delta1,
__global
const
float
*
restrict
delta2,
__global
int*
restrict
converged,
int
iteration
)
{
__kernel
void
updateAtomPositions
(
__global
const
int*
restrict
numAtomConstraints,
__global
const
int*
restrict
atomConstraints,
__global
const
mixed
4*
restrict
constraintDistance,
__global
mixed
4*
restrict
atomPositions,
__global
const
mixed
4*
restrict
velm,
__global
const
mixed
*
restrict
delta1,
__global
const
mixed
*
restrict
delta2,
__global
int*
restrict
converged,
int
iteration
)
{
if
(
get_global_id
(
0
)
==
0
)
converged[1-iteration%2]
=
1
;
if
(
converged[iteration%2]
)
return
; // The constraint iteration has already converged.
float
damping
=
(
iteration
<
2
?
0.5f
:
1.0f
)
;
mixed
damping
=
(
iteration
<
2
?
0.5f
:
1.0f
)
;
for
(
int
index
=
get_global_id
(
0
)
; index < NUM_ATOMS; index += get_global_size(0)) {
//
Compute
the
new
position
of
this
atom.
float
4
atomPos
=
atomPositions[index]
;
float
invMass
=
velm[index].w
;
mixed
4
atomPos
=
atomPositions[index]
;
mixed
invMass
=
velm[index].w
;
int
num
=
numAtomConstraints[index]
;
for
(
int
i
=
0
; i < num; i++) {
int
constraint
=
atomConstraints[index+i*NUM_ATOMS]
;
bool
forward
=
(
constraint
>
0
)
;
constraint
=
(
forward
?
constraint-1
:
-constraint-1
)
;
float
constraintForce
=
damping*invMass*delta2[constraint]
;
mixed
constraintForce
=
damping*invMass*delta2[constraint]
;
constraintForce
=
(
forward
?
constraintForce
:
-constraintForce
)
;
float
4
dir
=
constraintDistance[constraint]
;
mixed
4
dir
=
constraintDistance[constraint]
;
atomPos.x
+=
constraintForce*dir.x
;
atomPos.y
+=
constraintForce*dir.y
;
atomPos.z
+=
constraintForce*dir.z
;
...
...
platforms/opencl/src/kernels/constraints.cl
View file @
8d6a2a01
__kernel
void
applyPositionDeltas
(
__global
float
4*
restrict
posq,
__global
float
4*
restrict
posDelta
)
{
__kernel
void
applyPositionDeltas
(
__global
real
4*
restrict
posq,
__global
real4*
restrict
posqCorrection,
__global
mixed
4*
restrict
posDelta
)
{
for
(
unsigned
int
index
=
get_global_id
(
0
)
; index < NUM_ATOMS; index += get_global_size(0)) {
float4
position
=
posq[index]
;
position.xyz
+=
posDelta[index].xyz
;
posq[index]
=
position
;
#
ifdef
USE_MIXED_PRECISION
real4
pos1
=
posq[index]
;
real4
pos2
=
posqCorrection[index]
;
mixed4
pos
=
(
mixed4
)
(
pos1.x+
(
mixed
)
pos2.x,
pos1.y+
(
mixed
)
pos2.y,
pos1.z+
(
mixed
)
pos2.z,
pos1.w
)
;
#
else
mixed4
pos
=
posq[index]
;
#
endif
pos.xyz
+=
posDelta[index].xyz
;
#
ifdef
USE_MIXED_PRECISION
posq[index]
=
(
real4
)
((
real
)
pos.x,
(
real
)
pos.y,
(
real
)
pos.z,
(
real
)
pos.w
)
;
posqCorrection[index]
=
(
real4
)
(
pos.x-
(
real
)
pos.x,
pos.y-
(
real
)
pos.y,
pos.z-
(
real
)
pos.z,
0
)
;
#
else
posq[index]
=
pos
;
#
endif
}
}
platforms/opencl/src/kernels/customIntegrator.cl
View file @
8d6a2a01
__kernel
void
computeSum
(
__global
const
float*
restrict
sumBuffer,
__global
float*
result,
unsigned
int
outputIndex,
int
bufferSize
)
{
__kernel
void
compute
Float
Sum
(
__global
const
float*
restrict
sumBuffer,
__global
float*
result,
unsigned
int
outputIndex,
int
bufferSize
)
{
__local
float
tempBuffer[WORK_GROUP_SIZE]
;
const
unsigned
int
thread
=
get_local_id
(
0
)
;
float
sum
=
0
.0f
;
float
sum
=
0
;
for
(
unsigned
int
index
=
thread
; index < bufferSize; index += get_local_size(0))
sum
+=
sumBuffer[index]
;
tempBuffer[thread]
=
sum
;
...
...
@@ -14,12 +14,41 @@ __kernel void computeSum(__global const float* restrict sumBuffer, __global floa
result[outputIndex]
=
tempBuffer[0]
;
}
__kernel
void
applyPositionDeltas
(
__global
float4*
restrict
posq,
__global
float4*
restrict
posDelta
)
{
#
ifdef
SUPPORTS_DOUBLE_PRECISION
__kernel
void
computeDoubleSum
(
__global
const
double*
restrict
sumBuffer,
__global
double*
result,
unsigned
int
outputIndex,
int
bufferSize
)
{
__local
double
tempBuffer[WORK_GROUP_SIZE]
;
const
unsigned
int
thread
=
get_local_id
(
0
)
;
double
sum
=
0
;
for
(
unsigned
int
index
=
thread
; index < bufferSize; index += get_local_size(0))
sum
+=
sumBuffer[index]
;
tempBuffer[thread]
=
sum
;
for
(
int
i
=
1
; i < WORK_GROUP_SIZE; i *= 2) {
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
thread%
(
i*2
)
==
0
&&
thread+i
<
WORK_GROUP_SIZE
)
tempBuffer[thread]
+=
tempBuffer[thread+i]
;
}
if
(
thread
==
0
)
result[outputIndex]
=
tempBuffer[0]
;
}
#
endif
__kernel
void
applyPositionDeltas
(
__global
real4*
restrict
posq,
__global
real4*
restrict
posqCorrection,
__global
mixed4*
restrict
posDelta
)
{
for
(
unsigned
int
index
=
get_global_id
(
0
)
; index < NUM_ATOMS; index += get_global_size(0)) {
float4
position
=
posq[index]
;
position.xyz
+=
posDelta[index].xyz
;
posq[index]
=
position
;
posDelta[index]
=
(
float4
)
0.0f
;
#
ifdef
USE_MIXED_PRECISION
real4
pos1
=
posq[index]
;
real4
pos2
=
posqCorrection[index]
;
mixed4
pos
=
(
mixed4
)
(
pos1.x+
(
mixed
)
pos2.x,
pos1.y+
(
mixed
)
pos2.y,
pos1.z+
(
mixed
)
pos2.z,
pos1.w
)
;
#
else
real4
pos
=
posq[index]
;
#
endif
pos.xyz
+=
posDelta[index].xyz
;
#
ifdef
USE_MIXED_PRECISION
posq[index]
=
(
real4
)
((
real
)
pos.x,
(
real
)
pos.y,
(
real
)
pos.z,
(
real
)
pos.w
)
;
posqCorrection[index]
=
(
real4
)
(
pos.x-
(
real
)
pos.x,
pos.y-
(
real
)
pos.y,
pos.z-
(
real
)
pos.z,
0
)
;
#
else
posq[index]
=
pos
;
#
endif
posDelta[index]
=
(
mixed4
)
0
;
}
}
...
...
platforms/opencl/src/kernels/customIntegratorGlobal.cl
View file @
8d6a2a01
__kernel
void
computeGlobal
(
__global
float
2*
restrict
dt,
__global
float
*
restrict
globals,
__global
float
*
restrict
params,
float
uniform,
float
gaussian,
__global
const
float
*
restrict
energy
)
{
__kernel
void
computeGlobal
(
__global
mixed
2*
restrict
dt,
__global
mixed
*
restrict
globals,
__global
mixed
*
restrict
params,
float
uniform,
float
gaussian,
__global
const
real
*
restrict
energy
)
{
COMPUTE_STEP
}
platforms/opencl/src/kernels/customIntegratorPerDof.cl
View file @
8d6a2a01
#
ifdef
SUPPORTS_DOUBLE_PRECISION
#
pragma
OPENCL
EXTENSION
cl_khr_fp64
:
enable
/**
*
Load
the
position
of
a
particle.
*/
mixed4
loadPos
(
__global
const
real4*
restrict
posq,
__global
const
real4*
restrict
posqCorrection,
int
index
)
{
#
ifdef
USE_MIXED_PRECISION
real4
pos1
=
posq[index]
;
real4
pos2
=
posqCorrection[index]
;
return
(
mixed4
)
(
pos1.x+
(
mixed
)
pos2.x,
pos1.y+
(
mixed
)
pos2.y,
pos1.z+
(
mixed
)
pos2.z,
pos1.w
)
;
#
else
return
posq[index]
;
#
endif
}
/**
*
Store
the
position
of
a
particle.
*/
void
storePos
(
__global
real4*
restrict
posq,
__global
real4*
restrict
posqCorrection,
int
index,
mixed4
pos
)
{
#
ifdef
USE_MIXED_PRECISION
posq[index]
=
(
real4
)
((
real
)
pos.x,
(
real
)
pos.y,
(
real
)
pos.z,
(
real
)
pos.w
)
;
posqCorrection[index]
=
(
real4
)
(
pos.x-
(
real
)
pos.x,
pos.y-
(
real
)
pos.y,
pos.z-
(
real
)
pos.z,
0
)
;
#
else
posq[index]
=
pos
;
#
endif
}
__kernel
void
computePerDof
(
__global
float
4*
restrict
posq,
__global
float
4*
restrict
pos
Delta
,
__global
float
4*
restrict
velm
,
__global
const
float
4*
restrict
force,
__global
const
float
2*
restrict
dt,
__global
const
float
*
restrict
globals,
__global
const
float
*
restrict
params,
__global
float
*
restrict
sum,
__global
const
float4*
restrict
gaussianValues,
unsigned
int
randomIndex,
__global
const
float4*
restrict
uniformValues,
__global
const
float
*
restrict
energy
__kernel
void
computePerDof
(
__global
real
4*
restrict
posq,
__global
real
4*
restrict
pos
qCorrection
,
__global
mixed
4*
restrict
posDelta
,
__global
mixed4*
restrict
velm,
__global
const
real
4*
restrict
force,
__global
const
mixed
2*
restrict
dt,
__global
const
mixed
*
restrict
globals,
__global
const
mixed
*
restrict
params,
__global
mixed
*
restrict
sum,
__global
const
float4*
restrict
gaussianValues,
unsigned
int
randomIndex,
__global
const
float4*
restrict
uniformValues,
__global
const
real
*
restrict
energy
PARAMETER_ARGUMENTS
)
{
float
stepSize
=
dt[0].y
;
mixed
stepSize
=
dt[0].y
;
int
index
=
get_global_id
(
0
)
;
randomIndex
+=
index
;
while
(
index
<
NUM_ATOMS
)
{
#
ifdef
SUPPORTS_DOUBLE_PRECISION
#
ifdef
LOAD_POS_AS_DELTA
double
4
position
=
convert_double4
(
posq[
index
]
+posDelta[index]
)
;
mixed
4
position
=
loadPos
(
posq,
posqCorrection,
index
)
+posDelta[index]
;
#
else
double4
position
=
convert_double4
(
posq[index]
)
;
#
endif
double4
velocity
=
convert_double4
(
velm[index]
)
;
double4
f
=
convert_double4
(
force[index]
)
;
double
mass
=
1.0/velocity.w
;
#
else
#
ifdef
LOAD_POS_AS_DELTA
float4
position
=
posq[index]+posDelta[index]
;
#
else
float4
position
=
posq[index]
;
#
endif
float4
velocity
=
velm[index]
;
float4
f
=
force[index]
;
float
mass
=
1.0f/velocity.w
;
mixed4
position
=
loadPos
(
posq,
posqCorrection,
index
)
;
#
endif
mixed4
velocity
=
velm[index]
;
real4
f
=
force[index]
;
mixed
mass
=
1/velocity.w
;
if
(
velocity.w
!=
0.0
)
{
float4
gaussian
=
gaussianValues[randomIndex]
;
float4
uniform
=
uniformValues[index]
;
...
...
platforms/opencl/src/kernels/ewald.cl
View file @
8d6a2a01
float2
multofFloat2
(
float2
a,
float2
b
)
{
return
(
float2
)
(
a.x*b.x
-
a.y*b.y,
a.x*b.y
+
a.y*b.x
)
;
}
...
...
platforms/opencl/src/kernels/langevin.cl
View file @
8d6a2a01
#
ifdef
SUPPORTS_DOUBLE_PRECISION
#
pragma
OPENCL
EXTENSION
cl_khr_fp64
:
enable
#
endif
enum
{VelScale,
ForceScale,
NoiseScale,
MaxParams}
;
/**
*
Perform
the
first
step
of
Langevin
integration.
*/
__kernel
void
integrateLangevinPart1
(
__global
float
4*
restrict
velm,
__global
const
float
4*
restrict
force,
__global
float
4*
restrict
posDelta,
__global
const
float
*
restrict
paramBuffer,
__global
const
float
2*
restrict
dt,
__global
const
float4*
restrict
random,
unsigned
int
randomIndex
)
{
float
vscale
=
paramBuffer[VelScale]
;
float
fscale
=
paramBuffer[ForceScale]
;
float
noisescale
=
paramBuffer[NoiseScale]
;
float
stepSize
=
dt[0].y
;
__kernel
void
integrateLangevinPart1
(
__global
mixed
4*
restrict
velm,
__global
const
real
4*
restrict
force,
__global
mixed
4*
restrict
posDelta,
__global
const
mixed
*
restrict
paramBuffer,
__global
const
mixed
2*
restrict
dt,
__global
const
float4*
restrict
random,
unsigned
int
randomIndex
)
{
mixed
vscale
=
paramBuffer[VelScale]
;
mixed
fscale
=
paramBuffer[ForceScale]
;
mixed
noisescale
=
paramBuffer[NoiseScale]
;
mixed
stepSize
=
dt[0].y
;
int
index
=
get_global_id
(
0
)
;
randomIndex
+=
index
;
while
(
index
<
NUM_ATOMS
)
{
float
4
velocity
=
velm[index]
;
mixed
4
velocity
=
velm[index]
;
if
(
velocity.w
!=
0.0
)
{
float
sqrtInvMass
=
sqrt
(
velocity.w
)
;
velocity.xyz
=
vscale*velocity.xyz
+
fscale*velocity.w*force[index].xyz
+
noisescale*sqrtInvMass*random[randomIndex].xyz
;
mixed
sqrtInvMass
=
sqrt
(
velocity.w
)
;
velocity.x
=
vscale*velocity.x
+
fscale*velocity.w*force[index].x
+
noisescale*sqrtInvMass*random[randomIndex].x
;
velocity.y
=
vscale*velocity.y
+
fscale*velocity.w*force[index].y
+
noisescale*sqrtInvMass*random[randomIndex].y
;
velocity.z
=
vscale*velocity.z
+
fscale*velocity.w*force[index].z
+
noisescale*sqrtInvMass*random[randomIndex].z
;
velm[index]
=
velocity
;
posDelta[index]
=
stepSize*velocity
;
}
...
...
@@ -33,7 +31,7 @@ __kernel void integrateLangevinPart1(__global float4* restrict velm, __global co
*
Perform
the
second
step
of
Langevin
integration.
*/
__kernel
void
integrateLangevinPart2
(
__global
float
4*
restrict
posq,
__global
const
float
4*
restrict
posDelta,
__global
float
4*
restrict
velm,
__global
const
float
2*
restrict
dt
)
{
__kernel
void
integrateLangevinPart2
(
__global
real
4*
restrict
posq,
__global
real4*
restrict
posqCorrection,
__global
const
mixed
4*
restrict
posDelta,
__global
mixed
4*
restrict
velm,
__global
const
mixed
2*
restrict
dt
)
{
#
ifdef
SUPPORTS_DOUBLE_PRECISION
double
invStepSize
=
1.0/dt[0].y
;
#
else
...
...
@@ -41,17 +39,28 @@ __kernel void integrateLangevinPart2(__global float4* restrict posq, __global co
#
endif
int
index
=
get_global_id
(
0
)
;
while
(
index
<
NUM_ATOMS
)
{
float
4
vel
=
velm[index]
;
mixed
4
vel
=
velm[index]
;
if
(
vel.w
!=
0.0
)
{
float4
pos
=
posq[index]
;
float4
delta
=
posDelta[index]
;
#
ifdef
USE_MIXED_PRECISION
real4
pos1
=
posq[index]
;
real4
pos2
=
posqCorrection[index]
;
mixed4
pos
=
(
mixed4
)
(
pos1.x+
(
mixed
)
pos2.x,
pos1.y+
(
mixed
)
pos2.y,
pos1.z+
(
mixed
)
pos2.z,
pos1.w
)
;
#
else
real4
pos
=
posq[index]
;
#
endif
mixed4
delta
=
posDelta[index]
;
pos.xyz
+=
delta.xyz
;
#
ifdef
SUPPORTS_DOUBLE_PRECISION
vel.xyz
=
convert_
float
4
(
invStepSize*convert_double4
(
delta
))
.
xyz
;
vel.xyz
=
convert_
mixed
4
(
invStepSize*convert_double4
(
delta
))
.
xyz
;
#
else
vel.xyz
=
invStepSize*delta.xyz
;
#
endif
#
ifdef
USE_MIXED_PRECISION
posq[index]
=
convert_real4
(
pos
)
;
posqCorrection[index]
=
(
real4
)
(
pos.x-
(
real
)
pos.x,
pos.y-
(
real
)
pos.y,
pos.z-
(
real
)
pos.z,
0
)
;
#
else
posq[index]
=
pos
;
#
endif
velm[index]
=
vel
;
}
index
+=
get_global_size
(
0
)
;
...
...
@@ -62,15 +71,15 @@ __kernel void integrateLangevinPart2(__global float4* restrict posq, __global co
*
Select
the
step
size
to
use
for
the
next
step.
*/
__kernel
void
selectLangevinStepSize
(
float
maxStepSize,
float
errorTol,
float
tau,
float
kT,
__global
float
2*
restrict
dt,
__global
const
float
4*
restrict
velm,
__global
const
float
4*
restrict
force,
__global
float
*
restrict
paramBuffer,
__local
float
*
restrict
params,
__local
float
*
restrict
error
)
{
__kernel
void
selectLangevinStepSize
(
mixed
maxStepSize,
mixed
errorTol,
mixed
tau,
mixed
kT,
__global
mixed
2*
restrict
dt,
__global
const
mixed
4*
restrict
velm,
__global
const
real
4*
restrict
force,
__global
mixed
*
restrict
paramBuffer,
__local
mixed
*
restrict
params,
__local
mixed
*
restrict
error
)
{
//
Calculate
the
error.
float
err
=
0.0f
;
mixed
err
=
0.0f
;
unsigned
int
index
=
get_local_id
(
0
)
;
while
(
index
<
NUM_ATOMS
)
{
float
4
f
=
force[index]
;
float
invMass
=
velm[index].w
;
real
4
f
=
force[index]
;
mixed
invMass
=
velm[index].w
;
err
+=
(
f.x*f.x
+
f.y*f.y
+
f.z*f.z
)
*invMass
;
index
+=
get_global_size
(
0
)
;
}
...
...
@@ -87,9 +96,9 @@ __kernel void selectLangevinStepSize(float maxStepSize, float errorTol, float ta
if
(
get_global_id
(
0
)
==
0
)
{
//
Select
the
new
step
size.
float
totalError
=
sqrt
(
error[0]/
(
NUM_ATOMS*3
))
;
float
newStepSize
=
sqrt
(
errorTol/totalError
)
;
float
oldStepSize
=
dt[0].y
;
mixed
totalError
=
sqrt
(
error[0]/
(
NUM_ATOMS*3
))
;
mixed
newStepSize
=
sqrt
(
errorTol/totalError
)
;
mixed
oldStepSize
=
dt[0].y
;
if
(
oldStepSize
>
0.0f
)
newStepSize
=
min
(
newStepSize,
oldStepSize*2.0f
)
; // For safety, limit how quickly dt can increase.
if
(
newStepSize
>
oldStepSize
&&
newStepSize
<
1.1f*oldStepSize
)
...
...
@@ -100,9 +109,9 @@ __kernel void selectLangevinStepSize(float maxStepSize, float errorTol, float ta
//
Recalculate
the
integration
parameters.
float
vscale
=
exp
(
-newStepSize/tau
)
;
float
fscale
=
(
1-vscale
)
*tau
;
float
noisescale
=
sqrt
(
2*kT/tau
)
*sqrt
(
0.5f*
(
1-vscale*vscale
)
*tau
)
;
mixed
vscale
=
exp
(
-newStepSize/tau
)
;
mixed
fscale
=
(
1-vscale
)
*tau
;
mixed
noisescale
=
sqrt
(
2*kT/tau
)
*sqrt
(
0.5f*
(
1-vscale*vscale
)
*tau
)
;
params[VelScale]
=
vscale
;
params[ForceScale]
=
fscale
;
params[NoiseScale]
=
noisescale
;
...
...
platforms/opencl/src/kernels/removeCM.cl
View file @
8d6a2a01
...
...
@@ -2,13 +2,16 @@
*
Calculate
the
center
of
mass
momentum.
*/
__kernel
void
calcCenterOfMassMomentum
(
int
numAtoms,
__global
const
float
4*
restrict
velm,
__global
float4*
restrict
cmMomentum,
__local
volatile
float4*
restrict
temp
)
{
__kernel
void
calcCenterOfMassMomentum
(
int
numAtoms,
__global
const
mixed
4*
restrict
velm,
__global
float4*
restrict
cmMomentum,
__local
volatile
float4*
restrict
temp
)
{
int
index
=
get_global_id
(
0
)
;
float4
cm
=
0.0f
;
while
(
index
<
numAtoms
)
{
float4
velocity
=
velm[index]
;
if
(
velocity.w
!=
0.0
)
cm.xyz
+=
velocity.xyz/velocity.w
;
mixed4
velocity
=
velm[index]
;
if
(
velocity.w
!=
0
)
{
cm.x
+=
velocity.x/velocity.w
;
cm.y
+=
velocity.y/velocity.w
;
cm.z
+=
velocity.z/velocity.w
;
}
index
+=
get_global_size
(
0
)
;
}
...
...
@@ -54,7 +57,7 @@ __kernel void calcCenterOfMassMomentum(int numAtoms, __global const float4* rest
*
Remove
center
of
mass
motion.
*/
__kernel
void
removeCenterOfMassMomentum
(
unsigned
int
numAtoms,
__global
float
4*
restrict
velm,
__global
const
float4*
restrict
cmMomentum,
__local
volatile
float4*
restrict
temp
)
{
__kernel
void
removeCenterOfMassMomentum
(
unsigned
int
numAtoms,
__global
mixed
4*
restrict
velm,
__global
const
float4*
restrict
cmMomentum,
__local
volatile
float4*
restrict
temp
)
{
//
First
sum
all
of
the
momenta
that
were
calculated
by
individual
groups.
unsigned
int
index
=
get_local_id
(
0
)
;
...
...
@@ -101,7 +104,9 @@ __kernel void removeCenterOfMassMomentum(unsigned int numAtoms, __global float4*
index
=
get_global_id
(
0
)
;
while
(
index
<
numAtoms
)
{
velm[index].xyz
-=
cm.xyz
;
velm[index].x
-=
cm.x
;
velm[index].y
-=
cm.y
;
velm[index].z
-=
cm.z
;
index
+=
get_global_size
(
0
)
;
}
}
platforms/opencl/src/kernels/settle.cl
View file @
8d6a2a01
mixed4
loadPos
(
__global
const
real4*
restrict
posq,
__global
const
real4*
restrict
posqCorrection,
int
index
)
{
#
ifdef
USE_MIXED_PRECISION
real4
pos1
=
posq[index]
;
real4
pos2
=
posqCorrection[index]
;
return
(
mixed4
)
(
pos1.x+
(
mixed
)
pos2.x,
pos1.y+
(
mixed
)
pos2.y,
pos1.z+
(
mixed
)
pos2.z,
pos1.w
)
;
#
else
return
posq[index]
;
#
endif
}
/**
*
Enforce
constraints
on
SETTLE
clusters
*/
__kernel
void
applySettle
(
int
numClusters,
float
tol,
__global
const
float
4*
restrict
oldPos,
__global
float
4*
restrict
posDelta,
__global
const
float
4*
restrict
velm,
__global
const
int4*
restrict
clusterAtoms,
__global
const
float2*
restrict
clusterParams
)
{
__kernel
void
applySettle
(
int
numClusters,
mixed
tol,
__global
const
real
4*
restrict
oldPos,
__global
const
real4*
restrict
posCorrection,
__global
mixed
4*
restrict
posDelta,
__global
const
mixed
4*
restrict
velm,
__global
const
int4*
restrict
clusterAtoms,
__global
const
float2*
restrict
clusterParams
)
{
int
index
=
get_global_id
(
0
)
;
while
(
index
<
numClusters
)
{
//
Load
the
data
for
this
cluster.
int4
atoms
=
clusterAtoms[index]
;
float2
params
=
clusterParams[index]
;
float
4
apos0
=
oldPos[
atoms.x
]
;
float
4
xp0
=
posDelta[atoms.x]
;
float
4
apos1
=
oldPos[
atoms.y
]
;
float
4
xp1
=
posDelta[atoms.y]
;
float
4
apos2
=
oldPos[
atoms.z
]
;
float
4
xp2
=
posDelta[atoms.z]
;
float
m0
=
RECIP
(
velm[atoms.x].w
)
;
float
m1
=
RECIP
(
velm[atoms.y].w
)
;
float
m2
=
RECIP
(
velm[atoms.z].w
)
;
mixed
4
apos0
=
loadPos
(
oldPos,
posCorrection,
atoms.x
)
;
mixed
4
xp0
=
posDelta[atoms.x]
;
mixed
4
apos1
=
loadPos
(
oldPos,
posCorrection,
atoms.y
)
;
mixed
4
xp1
=
posDelta[atoms.y]
;
mixed
4
apos2
=
loadPos
(
oldPos,
posCorrection,
atoms.z
)
;
mixed
4
xp2
=
posDelta[atoms.z]
;
mixed
m0
=
1/
velm[atoms.x].w
;
mixed
m1
=
1/
velm[atoms.y].w
;
mixed
m2
=
1/
velm[atoms.z].w
;
//
Apply
the
SETTLE
algorithm.
float
xb0
=
apos1.x-apos0.x
;
float
yb0
=
apos1.y-apos0.y
;
float
zb0
=
apos1.z-apos0.z
;
float
xc0
=
apos2.x-apos0.x
;
float
yc0
=
apos2.y-apos0.y
;
float
zc0
=
apos2.z-apos0.z
;
float
invTotalMass
=
1.0f/
(
m0+m1+m2
)
;
float
xcom
=
(
xp0.x*m0
+
(
xb0+xp1.x
)
*m1
+
(
xc0+xp2.x
)
*m2
)
*
invTotalMass
;
float
ycom
=
(
xp0.y*m0
+
(
yb0+xp1.y
)
*m1
+
(
yc0+xp2.y
)
*m2
)
*
invTotalMass
;
float
zcom
=
(
xp0.z*m0
+
(
zb0+xp1.z
)
*m1
+
(
zc0+xp2.z
)
*m2
)
*
invTotalMass
;
float
xa1
=
xp0.x
-
xcom
;
float
ya1
=
xp0.y
-
ycom
;
float
za1
=
xp0.z
-
zcom
;
float
xb1
=
xb0
+
xp1.x
-
xcom
;
float
yb1
=
yb0
+
xp1.y
-
ycom
;
float
zb1
=
zb0
+
xp1.z
-
zcom
;
float
xc1
=
xc0
+
xp2.x
-
xcom
;
float
yc1
=
yc0
+
xp2.y
-
ycom
;
float
zc1
=
zc0
+
xp2.z
-
zcom
;
float
xaksZd
=
yb0*zc0
-
zb0*yc0
;
float
yaksZd
=
zb0*xc0
-
xb0*zc0
;
float
zaksZd
=
xb0*yc0
-
yb0*xc0
;
float
xaksXd
=
ya1*zaksZd
-
za1*yaksZd
;
float
yaksXd
=
za1*xaksZd
-
xa1*zaksZd
;
float
zaksXd
=
xa1*yaksZd
-
ya1*xaksZd
;
float
xaksYd
=
yaksZd*zaksXd
-
zaksZd*yaksXd
;
float
yaksYd
=
zaksZd*xaksXd
-
xaksZd*zaksXd
;
float
zaksYd
=
xaksZd*yaksXd
-
yaksZd*xaksXd
;
float
axlng
=
sqrt
(
xaksXd*xaksXd
+
yaksXd*yaksXd
+
zaksXd*zaksXd
)
;
float
aylng
=
sqrt
(
xaksYd*xaksYd
+
yaksYd*yaksYd
+
zaksYd*zaksYd
)
;
float
azlng
=
sqrt
(
xaksZd*xaksZd
+
yaksZd*yaksZd
+
zaksZd*zaksZd
)
;
float
trns11
=
xaksXd
/
axlng
;
float
trns21
=
yaksXd
/
axlng
;
float
trns31
=
zaksXd
/
axlng
;
float
trns12
=
xaksYd
/
aylng
;
float
trns22
=
yaksYd
/
aylng
;
float
trns32
=
zaksYd
/
aylng
;
float
trns13
=
xaksZd
/
azlng
;
float
trns23
=
yaksZd
/
azlng
;
float
trns33
=
zaksZd
/
azlng
;
float
xb0d
=
trns11*xb0
+
trns21*yb0
+
trns31*zb0
;
float
yb0d
=
trns12*xb0
+
trns22*yb0
+
trns32*zb0
;
float
xc0d
=
trns11*xc0
+
trns21*yc0
+
trns31*zc0
;
float
yc0d
=
trns12*xc0
+
trns22*yc0
+
trns32*zc0
;
float
za1d
=
trns13*xa1
+
trns23*ya1
+
trns33*za1
;
float
xb1d
=
trns11*xb1
+
trns21*yb1
+
trns31*zb1
;
float
yb1d
=
trns12*xb1
+
trns22*yb1
+
trns32*zb1
;
float
zb1d
=
trns13*xb1
+
trns23*yb1
+
trns33*zb1
;
float
xc1d
=
trns11*xc1
+
trns21*yc1
+
trns31*zc1
;
float
yc1d
=
trns12*xc1
+
trns22*yc1
+
trns32*zc1
;
float
zc1d
=
trns13*xc1
+
trns23*yc1
+
trns33*zc1
;
mixed
xb0
=
apos1.x-apos0.x
;
mixed
yb0
=
apos1.y-apos0.y
;
mixed
zb0
=
apos1.z-apos0.z
;
mixed
xc0
=
apos2.x-apos0.x
;
mixed
yc0
=
apos2.y-apos0.y
;
mixed
zc0
=
apos2.z-apos0.z
;
mixed
invTotalMass
=
1.0f/
(
m0+m1+m2
)
;
mixed
xcom
=
(
xp0.x*m0
+
(
xb0+xp1.x
)
*m1
+
(
xc0+xp2.x
)
*m2
)
*
invTotalMass
;
mixed
ycom
=
(
xp0.y*m0
+
(
yb0+xp1.y
)
*m1
+
(
yc0+xp2.y
)
*m2
)
*
invTotalMass
;
mixed
zcom
=
(
xp0.z*m0
+
(
zb0+xp1.z
)
*m1
+
(
zc0+xp2.z
)
*m2
)
*
invTotalMass
;
mixed
xa1
=
xp0.x
-
xcom
;
mixed
ya1
=
xp0.y
-
ycom
;
mixed
za1
=
xp0.z
-
zcom
;
mixed
xb1
=
xb0
+
xp1.x
-
xcom
;
mixed
yb1
=
yb0
+
xp1.y
-
ycom
;
mixed
zb1
=
zb0
+
xp1.z
-
zcom
;
mixed
xc1
=
xc0
+
xp2.x
-
xcom
;
mixed
yc1
=
yc0
+
xp2.y
-
ycom
;
mixed
zc1
=
zc0
+
xp2.z
-
zcom
;
mixed
xaksZd
=
yb0*zc0
-
zb0*yc0
;
mixed
yaksZd
=
zb0*xc0
-
xb0*zc0
;
mixed
zaksZd
=
xb0*yc0
-
yb0*xc0
;
mixed
xaksXd
=
ya1*zaksZd
-
za1*yaksZd
;
mixed
yaksXd
=
za1*xaksZd
-
xa1*zaksZd
;
mixed
zaksXd
=
xa1*yaksZd
-
ya1*xaksZd
;
mixed
xaksYd
=
yaksZd*zaksXd
-
zaksZd*yaksXd
;
mixed
yaksYd
=
zaksZd*xaksXd
-
xaksZd*zaksXd
;
mixed
zaksYd
=
xaksZd*yaksXd
-
yaksZd*xaksXd
;
mixed
axlng
=
sqrt
(
xaksXd*xaksXd
+
yaksXd*yaksXd
+
zaksXd*zaksXd
)
;
mixed
aylng
=
sqrt
(
xaksYd*xaksYd
+
yaksYd*yaksYd
+
zaksYd*zaksYd
)
;
mixed
azlng
=
sqrt
(
xaksZd*xaksZd
+
yaksZd*yaksZd
+
zaksZd*zaksZd
)
;
mixed
trns11
=
xaksXd
/
axlng
;
mixed
trns21
=
yaksXd
/
axlng
;
mixed
trns31
=
zaksXd
/
axlng
;
mixed
trns12
=
xaksYd
/
aylng
;
mixed
trns22
=
yaksYd
/
aylng
;
mixed
trns32
=
zaksYd
/
aylng
;
mixed
trns13
=
xaksZd
/
azlng
;
mixed
trns23
=
yaksZd
/
azlng
;
mixed
trns33
=
zaksZd
/
azlng
;
mixed
xb0d
=
trns11*xb0
+
trns21*yb0
+
trns31*zb0
;
mixed
yb0d
=
trns12*xb0
+
trns22*yb0
+
trns32*zb0
;
mixed
xc0d
=
trns11*xc0
+
trns21*yc0
+
trns31*zc0
;
mixed
yc0d
=
trns12*xc0
+
trns22*yc0
+
trns32*zc0
;
mixed
za1d
=
trns13*xa1
+
trns23*ya1
+
trns33*za1
;
mixed
xb1d
=
trns11*xb1
+
trns21*yb1
+
trns31*zb1
;
mixed
yb1d
=
trns12*xb1
+
trns22*yb1
+
trns32*zb1
;
mixed
zb1d
=
trns13*xb1
+
trns23*yb1
+
trns33*zb1
;
mixed
xc1d
=
trns11*xc1
+
trns21*yc1
+
trns31*zc1
;
mixed
yc1d
=
trns12*xc1
+
trns22*yc1
+
trns32*zc1
;
mixed
zc1d
=
trns13*xc1
+
trns23*yc1
+
trns33*zc1
;
//
---
Step2
A2
'
---
float
rc
=
0.5*params.y
;
float
rb
=
sqrt
(
params.x*params.x-rc*rc
)
;
float
ra
=
rb*
(
m1+m2
)
*invTotalMass
;
mixed
rb
=
sqrt
(
params.x*params.x-rc*rc
)
;
mixed
ra
=
rb*
(
m1+m2
)
*invTotalMass
;
rb
-=
ra
;
float
sinphi
=
za1d
/
ra
;
float
cosphi
=
sqrt
(
1.0f
-
sinphi*sinphi
)
;
float
sinpsi
=
(
zb1d
-
zc1d
)
/
(
2*rc*cosphi
)
;
float
cospsi
=
sqrt
(
1.0f
-
sinpsi*sinpsi
)
;
float
ya2d
=
ra*cosphi
;
float
xb2d
=
-
rc*cospsi
;
float
yb2d
=
-
rb*cosphi
-
rc*sinpsi*sinphi
;
float
yc2d
=
-
rb*cosphi
+
rc*sinpsi*sinphi
;
float
xb2d2
=
xb2d*xb2d
;
float
hh2
=
4.0f*xb2d2
+
(
yb2d-yc2d
)
*
(
yb2d-yc2d
)
+
(
zb1d-zc1d
)
*
(
zb1d-zc1d
)
;
float
deltx
=
2.0f*xb2d
+
sqrt
(
4.0f*xb2d2
-
hh2
+
params.y*params.y
)
;
mixed
sinphi
=
za1d
/
ra
;
mixed
cosphi
=
sqrt
(
1.0f
-
sinphi*sinphi
)
;
mixed
sinpsi
=
(
zb1d
-
zc1d
)
/
(
2*rc*cosphi
)
;
mixed
cospsi
=
sqrt
(
1.0f
-
sinpsi*sinpsi
)
;
mixed
ya2d
=
ra*cosphi
;
mixed
xb2d
=
-
rc*cospsi
;
mixed
yb2d
=
-
rb*cosphi
-
rc*sinpsi*sinphi
;
mixed
yc2d
=
-
rb*cosphi
+
rc*sinpsi*sinphi
;
mixed
xb2d2
=
xb2d*xb2d
;
mixed
hh2
=
4.0f*xb2d2
+
(
yb2d-yc2d
)
*
(
yb2d-yc2d
)
+
(
zb1d-zc1d
)
*
(
zb1d-zc1d
)
;
mixed
deltx
=
2.0f*xb2d
+
sqrt
(
4.0f*xb2d2
-
hh2
+
params.y*params.y
)
;
xb2d
-=
deltx*0.5
;
//
---
Step3
al,be,ga
---
float
alpha
=
(
xb2d*
(
xb0d-xc0d
)
+
yb0d*yb2d
+
yc0d*yc2d
)
;
float
beta
=
(
xb2d*
(
yc0d-yb0d
)
+
xb0d*yb2d
+
xc0d*yc2d
)
;
float
gamma
=
xb0d*yb1d
-
xb1d*yb0d
+
xc0d*yc1d
-
xc1d*yc0d
;
mixed
alpha
=
(
xb2d*
(
xb0d-xc0d
)
+
yb0d*yb2d
+
yc0d*yc2d
)
;
mixed
beta
=
(
xb2d*
(
yc0d-yb0d
)
+
xb0d*yb2d
+
xc0d*yc2d
)
;
mixed
gamma
=
xb0d*yb1d
-
xb1d*yb0d
+
xc0d*yc1d
-
xc1d*yc0d
;
float
al2be2
=
alpha*alpha
+
beta*beta
;
float
sintheta
=
(
alpha*gamma
-
beta*sqrt
(
al2be2
-
gamma*gamma
))
/
al2be2
;
mixed
al2be2
=
alpha*alpha
+
beta*beta
;
mixed
sintheta
=
(
alpha*gamma
-
beta*sqrt
(
al2be2
-
gamma*gamma
))
/
al2be2
;
//
---
Step4
A3
'
---
float
costheta
=
sqrt
(
1.0f
-
sintheta*sintheta
)
;
float
xa3d
=
-
ya2d*sintheta
;
float
ya3d
=
ya2d*costheta
;
float
za3d
=
za1d
;
float
xb3d
=
xb2d*costheta
-
yb2d*sintheta
;
float
yb3d
=
xb2d*sintheta
+
yb2d*costheta
;
float
zb3d
=
zb1d
;
float
xc3d
=
-
xb2d*costheta
-
yc2d*sintheta
;
float
yc3d
=
-
xb2d*sintheta
+
yc2d*costheta
;
float
zc3d
=
zc1d
;
mixed
costheta
=
sqrt
(
1.0f
-
sintheta*sintheta
)
;
mixed
xa3d
=
-
ya2d*sintheta
;
mixed
ya3d
=
ya2d*costheta
;
mixed
za3d
=
za1d
;
mixed
xb3d
=
xb2d*costheta
-
yb2d*sintheta
;
mixed
yb3d
=
xb2d*sintheta
+
yb2d*costheta
;
mixed
zb3d
=
zb1d
;
mixed
xc3d
=
-
xb2d*costheta
-
yc2d*sintheta
;
mixed
yc3d
=
-
xb2d*sintheta
+
yc2d*costheta
;
mixed
zc3d
=
zc1d
;
//
---
Step5
A3
---
float
xa3
=
trns11*xa3d
+
trns12*ya3d
+
trns13*za3d
;
float
ya3
=
trns21*xa3d
+
trns22*ya3d
+
trns23*za3d
;
float
za3
=
trns31*xa3d
+
trns32*ya3d
+
trns33*za3d
;
float
xb3
=
trns11*xb3d
+
trns12*yb3d
+
trns13*zb3d
;
float
yb3
=
trns21*xb3d
+
trns22*yb3d
+
trns23*zb3d
;
float
zb3
=
trns31*xb3d
+
trns32*yb3d
+
trns33*zb3d
;
float
xc3
=
trns11*xc3d
+
trns12*yc3d
+
trns13*zc3d
;
float
yc3
=
trns21*xc3d
+
trns22*yc3d
+
trns23*zc3d
;
float
zc3
=
trns31*xc3d
+
trns32*yc3d
+
trns33*zc3d
;
mixed
xa3
=
trns11*xa3d
+
trns12*ya3d
+
trns13*za3d
;
mixed
ya3
=
trns21*xa3d
+
trns22*ya3d
+
trns23*za3d
;
mixed
za3
=
trns31*xa3d
+
trns32*ya3d
+
trns33*za3d
;
mixed
xb3
=
trns11*xb3d
+
trns12*yb3d
+
trns13*zb3d
;
mixed
yb3
=
trns21*xb3d
+
trns22*yb3d
+
trns23*zb3d
;
mixed
zb3
=
trns31*xb3d
+
trns32*yb3d
+
trns33*zb3d
;
mixed
xc3
=
trns11*xc3d
+
trns12*yc3d
+
trns13*zc3d
;
mixed
yc3
=
trns21*xc3d
+
trns22*yc3d
+
trns23*zc3d
;
mixed
zc3
=
trns31*xc3d
+
trns32*yc3d
+
trns33*zc3d
;
xp0.x
=
xcom
+
xa3
;
xp0.y
=
ycom
+
ya3
;
...
...
@@ -155,49 +165,49 @@ __kernel void applySettle(int numClusters, float tol, __global const float4* res
*
Enforce
velocity
constraints
on
SETTLE
clusters
*/
__kernel
void
constrainVelocities
(
int
numClusters,
float
tol,
__global
const
float
4*
restrict
oldPos,
__global
float
4*
restrict
posDelta,
__global
float
4*
restrict
velm,
__global
const
int4*
restrict
clusterAtoms,
__global
const
float2*
restrict
clusterParams
)
{
__kernel
void
constrainVelocities
(
int
numClusters,
mixed
tol,
__global
const
real
4*
restrict
oldPos,
__global
const
real4*
restrict
posCorrection,
__global
mixed
4*
restrict
posDelta,
__global
mixed
4*
restrict
velm,
__global
const
int4*
restrict
clusterAtoms,
__global
const
float2*
restrict
clusterParams
)
{
for
(
int
index
=
get_global_id
(
0
)
; index < numClusters; index += get_global_size(0)) {
//
Load
the
data
for
this
cluster.
int4
atoms
=
clusterAtoms[index]
;
float
4
apos0
=
oldPos[
atoms.x
]
;
float
4
apos1
=
oldPos[
atoms.y
]
;
float
4
apos2
=
oldPos[
atoms.z
]
;
float
4
v0
=
velm[atoms.x]
;
float
4
v1
=
velm[atoms.y]
;
float
4
v2
=
velm[atoms.z]
;
mixed
4
apos0
=
loadPos
(
oldPos,
posCorrection,
atoms.x
)
;
mixed
4
apos1
=
loadPos
(
oldPos,
posCorrection,
atoms.y
)
;
mixed
4
apos2
=
loadPos
(
oldPos,
posCorrection,
atoms.z
)
;
mixed
4
v0
=
velm[atoms.x]
;
mixed
4
v1
=
velm[atoms.y]
;
mixed
4
v2
=
velm[atoms.z]
;
//
Compute
intermediate
quantities:
the
atom
masses,
the
bond
directions,
the
relative
velocities,
//
and
the
angle
cosines
and
sines.
float
mA
=
RECIP
(
v0.w
)
;
float
mB
=
RECIP
(
v1.w
)
;
float
mC
=
RECIP
(
v2.w
)
;
float
4
eAB
=
apos1-apos0
;
float
4
eBC
=
apos2-apos1
;
float
4
eCA
=
apos0-apos2
;
eAB.xyz
/=
SQRT
(
eAB.x*eAB.x
+
eAB.y*eAB.y
+
eAB.z*eAB.z
)
;
eBC.xyz
/=
SQRT
(
eBC.x*eBC.x
+
eBC.y*eBC.y
+
eBC.z*eBC.z
)
;
eCA.xyz
/=
SQRT
(
eCA.x*eCA.x
+
eCA.y*eCA.y
+
eCA.z*eCA.z
)
;
float
vAB
=
(
v1.x-v0.x
)
*eAB.x
+
(
v1.y-v0.y
)
*eAB.y
+
(
v1.z-v0.z
)
*eAB.z
;
float
vBC
=
(
v2.x-v1.x
)
*eBC.x
+
(
v2.y-v1.y
)
*eBC.y
+
(
v2.z-v1.z
)
*eBC.z
;
float
vCA
=
(
v0.x-v2.x
)
*eCA.x
+
(
v0.y-v2.y
)
*eCA.y
+
(
v0.z-v2.z
)
*eCA.z
;
float
cA
=
-
(
eAB.x*eCA.x
+
eAB.y*eCA.y
+
eAB.z*eCA.z
)
;
float
cB
=
-
(
eAB.x*eBC.x
+
eAB.y*eBC.y
+
eAB.z*eBC.z
)
;
float
cC
=
-
(
eBC.x*eCA.x
+
eBC.y*eCA.y
+
eBC.z*eCA.z
)
;
float
s2A
=
1-cA*cA
;
float
s2B
=
1-cB*cB
;
float
s2C
=
1-cC*cC
;
mixed
mA
=
1/
v0.w
;
mixed
mB
=
1/
v1.w
;
mixed
mC
=
1/
v2.w
;
mixed
4
eAB
=
apos1-apos0
;
mixed
4
eBC
=
apos2-apos1
;
mixed
4
eCA
=
apos0-apos2
;
eAB.xyz
/=
sqrt
(
eAB.x*eAB.x
+
eAB.y*eAB.y
+
eAB.z*eAB.z
)
;
eBC.xyz
/=
sqrt
(
eBC.x*eBC.x
+
eBC.y*eBC.y
+
eBC.z*eBC.z
)
;
eCA.xyz
/=
sqrt
(
eCA.x*eCA.x
+
eCA.y*eCA.y
+
eCA.z*eCA.z
)
;
mixed
vAB
=
(
v1.x-v0.x
)
*eAB.x
+
(
v1.y-v0.y
)
*eAB.y
+
(
v1.z-v0.z
)
*eAB.z
;
mixed
vBC
=
(
v2.x-v1.x
)
*eBC.x
+
(
v2.y-v1.y
)
*eBC.y
+
(
v2.z-v1.z
)
*eBC.z
;
mixed
vCA
=
(
v0.x-v2.x
)
*eCA.x
+
(
v0.y-v2.y
)
*eCA.y
+
(
v0.z-v2.z
)
*eCA.z
;
mixed
cA
=
-
(
eAB.x*eCA.x
+
eAB.y*eCA.y
+
eAB.z*eCA.z
)
;
mixed
cB
=
-
(
eAB.x*eBC.x
+
eAB.y*eBC.y
+
eAB.z*eBC.z
)
;
mixed
cC
=
-
(
eBC.x*eCA.x
+
eBC.y*eCA.y
+
eBC.z*eCA.z
)
;
mixed
s2A
=
1-cA*cA
;
mixed
s2B
=
1-cB*cB
;
mixed
s2C
=
1-cC*cC
;
//
Solve
the
equations.
These
are
different
from
those
in
the
SETTLE
paper
(
JCC
13
(
8
)
,
pp.
952-962,
1992
)
,
because
//
in
going
from
equations
B1
to
B2,
they
make
the
assumption
that
mB=mC
(
but
don
't
bother
to
mention
they
're
//
making
that
assumption
)
.
We
allow
all
three
atoms
to
have
different
masses.
float
mABCinv
=
RECIP
(
mA*mB*mC
)
;
float
denom
=
(((
s2A*mB+s2B*mA
)
*mC+
(
s2A*mB*mB+2*
(
cA*cB*cC+1
)
*mA*mB+s2B*mA*
mA
))
*mC+s2C*mA*mB*
(
mA+mB
))
*mABCinv
;
float
tab
=
((
cB*cC*mA-cA*mB-cA*mC
)
*vCA
+
(
cA*cC*mB-cB*mC-cB*mA
)
*vBC
+
(
s2C*mA*mA*mB*mB*mABCinv+
(
mA+mB+mC
))
*vAB
)
/denom
;
float
tbc
=
((
cA*cB*mC-cC*mB-cC*mA
)
*vCA
+
(
s2A*mB*mB*mC*mC*mABCinv+
(
mA+mB+mC
))
*vBC
+
(
cA*cC*mB-cB*mA-cB*mC
)
*vAB
)
/denom
;
float
tca
=
((
s2B*mA*mA*mC*mC*mABCinv+
(
mA+mB+mC
))
*vCA
+
(
cA*cB*mC-cC*mB-cC*mA
)
*vBC
+
(
cB*cC*mA-cA*mB-cA*mC
)
*vAB
)
/denom
;
mixed
mABCinv
=
1/
(
mA*mB*mC
)
;
mixed
denom
=
(((
s2A*mB+s2B*mA
)
*mC+
(
s2A*mB*mB+2*
(
cA*cB*cC+1
)
*mA*mB+s2B*mA*
mA
))
*mC+s2C*mA*mB*
(
mA+mB
))
*mABCinv
;
mixed
tab
=
((
cB*cC*mA-cA*mB-cA*mC
)
*vCA
+
(
cA*cC*mB-cB*mC-cB*mA
)
*vBC
+
(
s2C*mA*mA*mB*mB*mABCinv+
(
mA+mB+mC
))
*vAB
)
/denom
;
mixed
tbc
=
((
cA*cB*mC-cC*mB-cC*mA
)
*vCA
+
(
s2A*mB*mB*mC*mC*mABCinv+
(
mA+mB+mC
))
*vBC
+
(
cA*cC*mB-cB*mA-cB*mC
)
*vAB
)
/denom
;
mixed
tca
=
((
s2B*mA*mA*mC*mC*mABCinv+
(
mA+mB+mC
))
*vCA
+
(
cA*cB*mC-cC*mB-cC*mA
)
*vBC
+
(
cB*cC*mA-cA*mB-cA*mC
)
*vAB
)
/denom
;
v0.xyz
+=
(
tab*eAB.xyz
-
tca*eCA.xyz
)
*v0.w
;
v1.xyz
+=
(
tbc*eBC.xyz
-
tab*eAB.xyz
)
*v1.w
;
v2.xyz
+=
(
tca*eCA.xyz
-
tbc*eBC.xyz
)
*v2.w
;
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment