Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
93c467b2
Commit
93c467b2
authored
Mar 22, 2013
by
Peter Eastman
Browse files
Merged 5.1Optimizations branch back to trunk
parent
f6d4557d
Changes
86
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1083 additions
and
892 deletions
+1083
-892
platforms/cuda/src/kernels/langevin.cu
platforms/cuda/src/kernels/langevin.cu
+4
-4
platforms/cuda/src/kernels/nonbonded.cu
platforms/cuda/src/kernels/nonbonded.cu
+304
-251
platforms/cuda/src/kernels/pme.cu
platforms/cuda/src/kernels/pme.cu
+95
-118
platforms/cuda/src/kernels/sort.cu
platforms/cuda/src/kernels/sort.cu
+42
-0
platforms/cuda/src/kernels/torsionForce.cu
platforms/cuda/src/kernels/torsionForce.cu
+2
-2
platforms/cuda/src/kernels/verlet.cu
platforms/cuda/src/kernels/verlet.cu
+2
-2
platforms/cuda/tests/TestCudaNonbondedForce.cpp
platforms/cuda/tests/TestCudaNonbondedForce.cpp
+8
-6
platforms/cuda/tests/TestCudaSort.cpp
platforms/cuda/tests/TestCudaSort.cpp
+13
-4
platforms/opencl/src/OpenCLBondedUtilities.cpp
platforms/opencl/src/OpenCLBondedUtilities.cpp
+66
-40
platforms/opencl/src/OpenCLContext.cpp
platforms/opencl/src/OpenCLContext.cpp
+10
-9
platforms/opencl/src/OpenCLFFT3D.cpp
platforms/opencl/src/OpenCLFFT3D.cpp
+119
-91
platforms/opencl/src/OpenCLFFT3D.h
platforms/opencl/src/OpenCLFFT3D.h
+2
-1
platforms/opencl/src/OpenCLIntegrationUtilities.cpp
platforms/opencl/src/OpenCLIntegrationUtilities.cpp
+27
-30
platforms/opencl/src/OpenCLIntegrationUtilities.h
platforms/opencl/src/OpenCLIntegrationUtilities.h
+0
-2
platforms/opencl/src/OpenCLKernels.cpp
platforms/opencl/src/OpenCLKernels.cpp
+116
-120
platforms/opencl/src/OpenCLKernels.h
platforms/opencl/src/OpenCLKernels.h
+13
-13
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+209
-164
platforms/opencl/src/OpenCLNonbondedUtilities.h
platforms/opencl/src/OpenCLNonbondedUtilities.h
+34
-16
platforms/opencl/src/OpenCLParallelKernels.cpp
platforms/opencl/src/OpenCLParallelKernels.cpp
+15
-17
platforms/opencl/src/OpenCLParallelKernels.h
platforms/opencl/src/OpenCLParallelKernels.h
+2
-2
No files found.
platforms/cuda/src/kernels/langevin.cu
View file @
93c467b2
...
...
@@ -95,8 +95,8 @@ extern "C" __global__ void selectLangevinStepSize(mixed maxStepSize, mixed error
if
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
==
0
)
{
// Select the new step size.
mixed
totalError
=
sqrt
(
error
[
0
]
/
(
NUM_ATOMS
*
3
));
mixed
newStepSize
=
sqrt
(
errorTol
/
totalError
);
mixed
totalError
=
SQRT
(
error
[
0
]
/
(
NUM_ATOMS
*
3
));
mixed
newStepSize
=
SQRT
(
errorTol
/
totalError
);
mixed
oldStepSize
=
dt
[
0
].
y
;
if
(
oldStepSize
>
0.0
f
)
newStepSize
=
min
(
newStepSize
,
oldStepSize
*
2.0
f
);
// For safety, limit how quickly dt can increase.
...
...
@@ -108,9 +108,9 @@ extern "C" __global__ void selectLangevinStepSize(mixed maxStepSize, mixed error
// Recalculate the integration parameters.
mixed
vscale
=
exp
(
-
newStepSize
/
tau
);
mixed
vscale
=
EXP
(
-
newStepSize
/
tau
);
mixed
fscale
=
(
1
-
vscale
)
*
tau
;
mixed
noisescale
=
sqrt
(
2
*
kT
/
tau
)
*
sqrt
(
0.5
f
*
(
1
-
vscale
*
vscale
)
*
tau
);
mixed
noisescale
=
SQRT
(
2
*
kT
/
tau
)
*
SQRT
(
0.5
f
*
(
1
-
vscale
*
vscale
)
*
tau
);
params
[
VelScale
]
=
vscale
;
params
[
ForceScale
]
=
fscale
;
params
[
NoiseScale
]
=
noisescale
;
...
...
platforms/cuda/src/kernels/nonbonded.cu
View file @
93c467b2
This diff is collapsed.
Click to expand it.
platforms/cuda/src/kernels/pme.cu
View file @
93c467b2
extern
"C"
__global__
void
updateBsplines
(
const
real4
*
__restrict__
posq
,
real4
*
__restrict__
pmeBsplineTheta
,
int2
*
__restrict__
pmeAtomGridIndex
,
extern
"C"
__global__
void
findAtomGridIndex
(
const
real4
*
__restrict__
posq
,
int2
*
__restrict__
pmeAtomGridIndex
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
)
{
extern
__shared__
real3
bsplinesCache
[];
real3
*
data
=
&
bsplinesCache
[
threadIdx
.
x
*
PME_ORDER
];
const
real3
scale
=
make_real3
(
RECIP
(
PME_ORDER
-
1
));
// Compute the index of the grid point each atom is associated with.
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
real4
pos
=
posq
[
i
];
pos
.
x
-=
floor
(
pos
.
x
*
invPeriodicBoxSize
.
x
)
*
periodicBoxSize
.
x
;
...
...
@@ -11,11 +10,40 @@ extern "C" __global__ void updateBsplines(const real4* __restrict__ posq, real4*
real3
t
=
make_real3
((
pos
.
x
*
invPeriodicBoxSize
.
x
)
*
GRID_SIZE_X
,
(
pos
.
y
*
invPeriodicBoxSize
.
y
)
*
GRID_SIZE_Y
,
(
pos
.
z
*
invPeriodicBoxSize
.
z
)
*
GRID_SIZE_Z
);
real3
dr
=
make_real3
(
t
.
x
-
(
int
)
t
.
x
,
t
.
y
-
(
int
)
t
.
y
,
t
.
z
-
(
int
)
t
.
z
);
int3
gridIndex
=
make_int3
(((
int
)
t
.
x
)
%
GRID_SIZE_X
,
((
int
)
t
.
y
)
%
GRID_SIZE_Y
,
((
int
)
t
.
z
)
%
GRID_SIZE_Z
);
pmeAtomGridIndex
[
i
]
=
make_int2
(
i
,
gridIndex
.
x
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
gridIndex
.
y
*
GRID_SIZE_Z
+
gridIndex
.
z
);
}
}
extern
"C"
__global__
void
gridSpreadCharge
(
const
real4
*
__restrict__
posq
,
real
*
__restrict__
originalPmeGrid
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
const
int2
*
__restrict__
pmeAtomGridIndex
)
{
real3
data
[
PME_ORDER
];
const
real
scale
=
RECIP
(
PME_ORDER
-
1
);
// Process the atoms in spatially sorted order. This improves efficiency when writing
// the grid values.
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
atom
=
pmeAtomGridIndex
[
i
].
x
;
real
charge
=
posq
[
atom
].
w
;
real3
force
=
make_real3
(
0
);
real4
pos
=
posq
[
atom
];
pos
.
x
-=
floor
(
pos
.
x
*
invPeriodicBoxSize
.
x
)
*
periodicBoxSize
.
x
;
pos
.
y
-=
floor
(
pos
.
y
*
invPeriodicBoxSize
.
y
)
*
periodicBoxSize
.
y
;
pos
.
z
-=
floor
(
pos
.
z
*
invPeriodicBoxSize
.
z
)
*
periodicBoxSize
.
z
;
real3
t
=
make_real3
((
pos
.
x
*
invPeriodicBoxSize
.
x
)
*
GRID_SIZE_X
,
(
pos
.
y
*
invPeriodicBoxSize
.
y
)
*
GRID_SIZE_Y
,
(
pos
.
z
*
invPeriodicBoxSize
.
z
)
*
GRID_SIZE_Z
);
int3
gridIndex
=
make_int3
(((
int
)
t
.
x
)
%
GRID_SIZE_X
,
((
int
)
t
.
y
)
%
GRID_SIZE_Y
,
((
int
)
t
.
z
)
%
GRID_SIZE_Z
);
// Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory.
real3
dr
=
make_real3
(
t
.
x
-
(
int
)
t
.
x
,
t
.
y
-
(
int
)
t
.
y
,
t
.
z
-
(
int
)
t
.
z
);
data
[
PME_ORDER
-
1
]
=
make_real3
(
0
);
data
[
1
]
=
dr
;
data
[
0
]
=
make_real3
(
1
)
-
dr
;
...
...
@@ -23,101 +51,49 @@ extern "C" __global__ void updateBsplines(const real4* __restrict__ posq, real4*
real
div
=
RECIP
(
j
-
1
);
data
[
j
-
1
]
=
div
*
dr
*
data
[
j
-
2
];
for
(
int
k
=
1
;
k
<
(
j
-
1
);
k
++
)
data
[
j
-
k
-
1
]
=
div
*
((
dr
+
make_real3
(
k
))
*
data
[
j
-
k
-
2
]
+
(
make_real3
(
j
-
k
)
-
dr
)
*
data
[
j
-
k
-
1
]);
data
[
j
-
k
-
1
]
=
div
*
((
dr
+
make_real3
(
k
))
*
data
[
j
-
k
-
2
]
+
(
make_real3
(
j
-
k
)
-
dr
)
*
data
[
j
-
k
-
1
]);
data
[
0
]
=
div
*
(
make_real3
(
1
)
-
dr
)
*
data
[
0
];
}
data
[
PME_ORDER
-
1
]
=
scale
*
dr
*
data
[
PME_ORDER
-
2
];
for
(
int
j
=
1
;
j
<
(
PME_ORDER
-
1
);
j
++
)
data
[
PME_ORDER
-
j
-
1
]
=
scale
*
((
dr
+
make_real3
(
j
))
*
data
[
PME_ORDER
-
j
-
2
]
+
(
make_real3
(
PME_ORDER
-
j
)
-
dr
)
*
data
[
PME_ORDER
-
j
-
1
]);
data
[
0
]
=
scale
*
(
make_real3
(
1
)
-
dr
)
*
data
[
0
];
for
(
int
j
=
0
;
j
<
PME_ORDER
;
j
++
)
{
real3
d
=
data
[
j
];
// Copy it as a workaround for a bug in CUDA 5.0
pmeBsplineTheta
[
i
+
j
*
NUM_ATOMS
]
=
make_real4
(
d
.
x
,
d
.
y
,
d
.
z
,
pos
.
w
);
// Storing the charge here improves cache coherency in the charge spreading kernel
}
}
}
/**
* For each grid point, find the range of sorted atoms associated with that point.
*/
extern
"C"
__global__
void
findAtomRangeForGrid
(
int2
*
__restrict__
pmeAtomGridIndex
,
int
*
__restrict__
pmeAtomRange
,
const
real4
*
__restrict__
posq
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
)
{
int
start
=
(
NUM_ATOMS
*
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
))
/
(
blockDim
.
x
*
gridDim
.
x
);
int
end
=
(
NUM_ATOMS
*
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
+
1
))
/
(
blockDim
.
x
*
gridDim
.
x
);
int
last
=
(
start
==
0
?
-
1
:
pmeAtomGridIndex
[
start
-
1
].
y
);
for
(
int
i
=
start
;
i
<
end
;
++
i
)
{
int2
atomData
=
pmeAtomGridIndex
[
i
];
int
gridIndex
=
atomData
.
y
;
if
(
gridIndex
!=
last
)
{
for
(
int
j
=
last
+
1
;
j
<=
gridIndex
;
++
j
)
pmeAtomRange
[
j
]
=
i
;
last
=
gridIndex
;
}
}
// Spread the charge from this atom onto each grid point.
// Fill in values beyond the last atom.
for
(
int
ix
=
0
;
ix
<
PME_ORDER
;
ix
++
)
{
int
xbase
=
gridIndex
.
x
+
ix
;
xbase
-=
(
xbase
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
);
xbase
=
xbase
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
real
dx
=
data
[
ix
].
x
;
if
(
blockIdx
.
x
==
gridDim
.
x
-
1
&&
threadIdx
.
x
==
blockDim
.
x
-
1
)
{
int
gridSize
=
GRID_SIZE_X
*
GRID_SIZE_Y
*
GRID_SIZE_Z
;
for
(
int
j
=
last
+
1
;
j
<=
gridSize
;
++
j
)
pmeAtomRange
[
j
]
=
NUM_ATOMS
;
}
}
for
(
int
iy
=
0
;
iy
<
PME_ORDER
;
iy
++
)
{
int
ybase
=
gridIndex
.
y
+
iy
;
ybase
-=
(
ybase
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
);
ybase
=
xbase
+
ybase
*
GRID_SIZE_Z
;
real
dy
=
data
[
iy
].
y
;
#define BUFFER_SIZE (PME_ORDER*PME_ORDER*PME_ORDER)
extern
"C"
__global__
void
gridSpreadCharge
(
const
real4
*
__restrict__
posq
,
real
*
__restrict__
originalPmeGrid
,
const
real4
*
__restrict__
pmeBsplineTheta
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
)
{
int
ix
=
threadIdx
.
x
/
(
PME_ORDER
*
PME_ORDER
);
int
remainder
=
threadIdx
.
x
-
ix
*
PME_ORDER
*
PME_ORDER
;
int
iy
=
remainder
/
PME_ORDER
;
int
iz
=
remainder
-
iy
*
PME_ORDER
;
__shared__
real4
theta
[
PME_ORDER
];
__shared__
real
charge
[
BUFFER_SIZE
];
__shared__
int
basex
[
BUFFER_SIZE
];
__shared__
int
basey
[
BUFFER_SIZE
];
__shared__
int
basez
[
BUFFER_SIZE
];
if
(
ix
<
PME_ORDER
)
{
for
(
int
baseIndex
=
blockIdx
.
x
*
BUFFER_SIZE
;
baseIndex
<
NUM_ATOMS
;
baseIndex
+=
gridDim
.
x
*
BUFFER_SIZE
)
{
// Load the next block of atoms into the buffers.
for
(
int
iz
=
0
;
iz
<
PME_ORDER
;
iz
++
)
{
int
zindex
=
gridIndex
.
z
+
iz
;
zindex
-=
(
zindex
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
);
int
index
=
ybase
+
zindex
;
int
atomIndex
=
baseIndex
+
threadIdx
.
x
;
if
(
atomIndex
<
NUM_ATOMS
)
{
real4
pos
=
posq
[
atomIndex
];
charge
[
threadIdx
.
x
]
=
pos
.
w
;
pos
.
x
-=
floor
(
pos
.
x
*
invPeriodicBoxSize
.
x
)
*
periodicBoxSize
.
x
;
pos
.
y
-=
floor
(
pos
.
y
*
invPeriodicBoxSize
.
y
)
*
periodicBoxSize
.
y
;
pos
.
z
-=
floor
(
pos
.
z
*
invPeriodicBoxSize
.
z
)
*
periodicBoxSize
.
z
;
basex
[
threadIdx
.
x
]
=
(
int
)
((
pos
.
x
*
invPeriodicBoxSize
.
x
)
*
GRID_SIZE_X
);
basey
[
threadIdx
.
x
]
=
(
int
)
((
pos
.
y
*
invPeriodicBoxSize
.
y
)
*
GRID_SIZE_Y
);
basez
[
threadIdx
.
x
]
=
(
int
)
((
pos
.
z
*
invPeriodicBoxSize
.
z
)
*
GRID_SIZE_Z
);
}
__syncthreads
();
int
lastIndex
=
min
(
BUFFER_SIZE
,
NUM_ATOMS
-
baseIndex
);
for
(
int
index
=
0
;
index
<
lastIndex
;
index
++
)
{
int
atomIndex
=
index
+
baseIndex
;
if
(
threadIdx
.
x
<
PME_ORDER
)
theta
[
threadIdx
.
x
]
=
pmeBsplineTheta
[
atomIndex
+
threadIdx
.
x
*
NUM_ATOMS
];
__syncthreads
();
real
add
=
charge
[
index
]
*
theta
[
ix
].
x
*
theta
[
iy
].
y
*
theta
[
iz
].
z
;
int
x
=
basex
[
index
]
+
ix
;
int
y
=
basey
[
index
]
+
iy
;
int
z
=
basez
[
index
]
+
iz
;
x
-=
(
x
>=
GRID_SIZE_X
?
GRID_SIZE_X
:
0
);
y
-=
(
y
>=
GRID_SIZE_Y
?
GRID_SIZE_Y
:
0
);
z
-=
(
z
>=
GRID_SIZE_Z
?
GRID_SIZE_Z
:
0
);
real
add
=
charge
*
dx
*
dy
*
data
[
iz
].
z
;
#ifdef USE_DOUBLE_PRECISION
unsigned
long
long
*
ulonglong_p
=
(
unsigned
long
long
*
)
originalPmeGrid
;
atomicAdd
(
&
ulonglong_p
[
x
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
y
*
GRID_SIZE_Z
+
z
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
add
*
0x100000000
)));
atomicAdd
(
&
ulonglong_p
[
index
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
add
*
0x100000000
)));
#elif __CUDA_ARCH__ < 200
unsigned
long
long
*
ulonglong_p
=
(
unsigned
long
long
*
)
originalPmeGrid
;
int
gridIndex
=
x
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
y
*
GRID_SIZE_Z
+
z
;
int
gridIndex
=
index
;
gridIndex
=
(
gridIndex
%
2
==
0
?
gridIndex
/
2
:
(
gridIndex
+
GRID_SIZE_X
*
GRID_SIZE_Y
*
GRID_SIZE_Z
)
/
2
);
atomicAdd
(
&
ulonglong_p
[
gridIndex
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
add
*
0x100000000
)));
#else
atomicAdd
(
&
originalPmeGrid
[
x
*
GRID_SIZE_Y
*
GRID_SIZE_Z
+
y
*
GRID_SIZE_Z
+
z
],
add
*
EPSILON_FACTOR
);
atomicAdd
(
&
originalPmeGrid
[
index
],
add
*
EPSILON_FACTOR
);
#endif
}
}
}
}
}
extern
"C"
__global__
void
finishSpreadCharge
(
long
long
*
__restrict__
originalPmeGrid
)
{
...
...
@@ -218,12 +194,16 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, real* __restrict__ e
extern
"C"
__global__
void
gridInterpolateForce
(
const
real4
*
__restrict__
posq
,
unsigned
long
long
*
__restrict__
forceBuffers
,
const
real
*
__restrict__
originalPmeGrid
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
)
{
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
const
int2
*
__restrict__
pmeAtomGridIndex
)
{
real3
data
[
PME_ORDER
];
real3
ddata
[
PME_ORDER
];
const
real
scale
=
RECIP
(
PME_ORDER
-
1
);
for
(
int
atom
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
atom
<
NUM_ATOMS
;
atom
+=
blockDim
.
x
*
gridDim
.
x
)
{
// Process the atoms in spatially sorted order. This improves cache performance when loading
// the grid values.
for
(
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
NUM_ATOMS
;
i
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
atom
=
pmeAtomGridIndex
[
i
].
x
;
real3
force
=
make_real3
(
0
);
real4
pos
=
posq
[
atom
];
pos
.
x
-=
floor
(
pos
.
x
*
invPeriodicBoxSize
.
x
)
*
periodicBoxSize
.
x
;
...
...
@@ -243,7 +223,6 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __
data
[
PME_ORDER
-
1
]
=
make_real3
(
0
);
data
[
1
]
=
dr
;
data
[
0
]
=
make_real3
(
1
)
-
dr
;
for
(
int
j
=
3
;
j
<
PME_ORDER
;
j
++
)
{
real
div
=
RECIP
(
j
-
1
);
data
[
j
-
1
]
=
div
*
dr
*
data
[
j
-
2
];
...
...
@@ -252,11 +231,9 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __
data
[
0
]
=
div
*
(
make_real3
(
1
)
-
dr
)
*
data
[
0
];
}
ddata
[
0
]
=
-
data
[
0
];
for
(
int
j
=
1
;
j
<
PME_ORDER
;
j
++
)
ddata
[
j
]
=
data
[
j
-
1
]
-
data
[
j
];
data
[
PME_ORDER
-
1
]
=
scale
*
dr
*
data
[
PME_ORDER
-
2
];
for
(
int
j
=
1
;
j
<
(
PME_ORDER
-
1
);
j
++
)
data
[
PME_ORDER
-
j
-
1
]
=
scale
*
((
dr
+
make_real3
(
j
))
*
data
[
PME_ORDER
-
j
-
2
]
+
(
make_real3
(
PME_ORDER
-
j
)
-
dr
)
*
data
[
PME_ORDER
-
j
-
1
]);
data
[
0
]
=
scale
*
(
make_real3
(
1
)
-
dr
)
*
data
[
0
];
...
...
platforms/cuda/src/kernels/sort.cu
View file @
93c467b2
...
...
@@ -4,6 +4,48 @@ __device__ KEY_TYPE getValue(DATA_TYPE value) {
extern
"C"
{
/**
 * Sort a short list in place using a single thread block.
 *
 * The whole list is staged in the dynamically sized __shared__ buffer, sorted
 * there with a bitonic network, and then copied back to global memory.
 *
 * @param data    the elements to sort; overwritten with the sorted result
 * @param length  the number of elements in data (need not be a power of two)
 */
__global__ void sortShortList(DATA_TYPE* __restrict__ data, unsigned int length) {
    // Stage the list in the shared buffer.

    extern __shared__ DATA_TYPE dataBuffer[];
    for (int pos = threadIdx.x; pos < length; pos += blockDim.x)
        dataBuffer[pos] = data[pos];
    __syncthreads();

    // Run the bitonic network over the shared buffer.  k is the size of the
    // sequences being merged, j is the distance between compared elements.

    for (unsigned int k = 2; k < 2*length; k *= 2) {
        for (unsigned int j = k/2; j > 0; j /= 2) {
            for (unsigned int i = threadIdx.x; i < length; i += blockDim.x) {
                int ixj = i^j;

                // Only the lower element of each pair does the compare-and-swap,
                // and partners that fall beyond the end of the list are ignored.

                if (ixj <= i || ixj >= length)
                    continue;
                DATA_TYPE value1 = dataBuffer[i];
                DATA_TYPE value2 = dataBuffer[ixj];

                // Work out the direction for this pair: start from bit k of i,
                // then flip once for every higher bit of i (below 2*length)
                // that is clear.

                bool ascending = ((i&k) == 0);
                for (unsigned int mask = k*2; mask < 2*length; mask *= 2) {
                    if ((i&mask) == 0)
                        ascending = !ascending;
                }

                // Swap the two elements if they are out of order for that direction.

                KEY_TYPE key1 = getValue(value1);
                KEY_TYPE key2 = getValue(value2);
                bool outOfOrder = (ascending ? key1 > key2 : key2 > key1);
                if (outOfOrder) {
                    dataBuffer[i] = value2;
                    dataBuffer[ixj] = value1;
                }
            }

            // Every thread must finish this pass before the next one reads the buffer.

            __syncthreads();
        }
    }

    // Copy the sorted list back to global memory.

    for (int pos = threadIdx.x; pos < length; pos += blockDim.x)
        data[pos] = dataBuffer[pos];
}
/**
* Calculate the minimum and maximum value in the array to be sorted. This kernel
* is executed as a single work group.
...
...
platforms/cuda/src/kernels/torsionForce.cu
View file @
93c467b2
...
...
@@ -16,12 +16,12 @@ if (cosangle > 0.99f || cosangle < -0.99f) {
theta
=
PI
-
theta
;
}
else
theta
=
acos
(
cosangle
);
theta
=
ACOS
(
cosangle
);
theta
=
(
dot
(
v0
,
cp1
)
>=
0
?
theta
:
-
theta
);
COMPUTE_FORCE
real
normCross1
=
dot
(
cp0
,
cp0
);
real
normSqrBC
=
dot
(
v1
,
v1
);
real
normBC
=
sqrt
(
normSqrBC
);
real
normBC
=
SQRT
(
normSqrBC
);
real
normCross2
=
dot
(
cp1
,
cp1
);
real
dp
=
RECIP
(
normSqrBC
);
real4
ff
=
make_real4
((
-
dEdAngle
*
normBC
)
/
normCross1
,
dot
(
v0
,
v1
)
*
dp
,
dot
(
v2
,
v1
)
*
dp
,
(
dEdAngle
*
normBC
)
/
normCross2
);
...
...
platforms/cuda/src/kernels/verlet.cu
View file @
93c467b2
...
...
@@ -93,8 +93,8 @@ extern "C" __global__ void selectVerletStepSize(mixed maxStepSize, mixed errorTo
__syncthreads
();
}
if
(
threadIdx
.
x
==
0
)
{
mixed
totalError
=
sqrt
(
error
[
0
]
/
(
NUM_ATOMS
*
3
));
mixed
newStepSize
=
sqrt
(
errorTol
/
totalError
);
mixed
totalError
=
SQRT
(
error
[
0
]
/
(
NUM_ATOMS
*
3
));
mixed
newStepSize
=
SQRT
(
errorTol
/
totalError
);
mixed
oldStepSize
=
dt
[
0
].
y
;
if
(
oldStepSize
>
0.0
f
)
newStepSize
=
min
(
newStepSize
,
oldStepSize
*
2.0
f
);
// For safety, limit how quickly dt can increase.
...
...
platforms/cuda/tests/TestCudaNonbondedForce.cpp
View file @
93c467b2
...
...
@@ -438,9 +438,9 @@ void testLargeSystem() {
}
ASSERT_EQUAL_TOL
(
cuState
.
getPotentialEnergy
(),
referenceState
.
getPotentialEnergy
(),
tol
);
}
/*
void testBlockInteractions(bool periodic) {
const
int
blockSize
=
32
;
const int blockSize =
CudaContext::TileSize
;
const int numBlocks = 100;
const int numParticles = blockSize*numBlocks;
const double cutoff = 1.0;
...
...
@@ -597,6 +597,8 @@ void testBlockInteractions(bool periodic) {
if (!hasInteractions[i]) {
unsigned int y = (unsigned int) std::floor(numBlocks+0.5-std::sqrt((numBlocks+0.5)*(numBlocks+0.5)-2*i));
unsigned int x = (i-y*numBlocks+y*(y+1)/2);
if (x == y)
continue; // This block has exclusions, so it will not be in the neighbor list.
for (int atom1 = 0; atom1 < blockSize; ++atom1) {
double4 pos1 = posq[x*blockSize+atom1];
for (int atom2 = 0; atom2 < blockSize; ++atom2) {
...
...
@@ -613,14 +615,14 @@ void testBlockInteractions(bool periodic) {
}
}
}
}
}
*/
void
testDispersionCorrection
()
{
// Create a box full of identical particles.
int
gridSize
=
5
;
int
numParticles
=
gridSize
*
gridSize
*
gridSize
;
double
boxSize
=
gridSize
*
0.
5
;
double
boxSize
=
gridSize
*
0.
7
;
double
cutoff
=
boxSize
/
3
;
System
system
;
VerletIntegrator
integrator
(
0.01
);
...
...
@@ -822,8 +824,8 @@ int main(int argc, char* argv[]) {
testCutoff14
();
testPeriodic
();
testLargeSystem
();
testBlockInteractions
(
false
);
testBlockInteractions
(
true
);
//
testBlockInteractions(false);
//
testBlockInteractions(true);
testDispersionCorrection
();
testChangingParameters
();
testParallelComputation
(
false
);
...
...
platforms/cuda/tests/TestCudaSort.cpp
View file @
93c467b2
...
...
@@ -87,8 +87,7 @@ void verifySorting(vector<float> array) {
ASSERT
(
elements1
==
elements2
);
}
void
testUniformValues
()
{
void
testUniformValues
()
{
OpenMM_SFMT
::
SFMT
sfmt
;
init_gen_rand
(
0
,
sfmt
);
...
...
@@ -98,8 +97,7 @@ void testUniformValues()
verifySorting
(
array
);
}
void
testLogValues
()
{
void
testLogValues
()
{
OpenMM_SFMT
::
SFMT
sfmt
;
init_gen_rand
(
0
,
sfmt
);
...
...
@@ -109,12 +107,23 @@ void testLogValues()
verifySorting
(
array
);
}
void
testShortList
()
{
OpenMM_SFMT
::
SFMT
sfmt
;
init_gen_rand
(
0
,
sfmt
);
vector
<
float
>
array
(
500
);
for
(
int
i
=
0
;
i
<
(
int
)
array
.
size
();
i
++
)
array
[
i
]
=
(
float
)
log
(
genrand_real2
(
sfmt
));
verifySorting
(
array
);
}
int
main
(
int
argc
,
char
*
argv
[])
{
try
{
if
(
argc
>
1
)
platform
.
setPropertyDefaultValue
(
"CudaPrecision"
,
string
(
argv
[
1
]));
testUniformValues
();
testLogValues
();
testShortList
();
}
catch
(
const
exception
&
e
)
{
cout
<<
"exception: "
<<
e
.
what
()
<<
endl
;
...
...
platforms/opencl/src/OpenCLBondedUtilities.cpp
View file @
93c467b2
...
...
@@ -99,6 +99,18 @@ void OpenCLBondedUtilities::initialize(const System& system) {
numBuffers
[
i
]
=
max
(
numBuffers
[
i
],
bufferCounter
[
i
][
j
]);
}
// For efficiency, we want to merge multiple forces into a single kernel - but only if that
// won't increase the number of force buffers.
if
(
context
.
getSupports64BitGlobalAtomics
())
{
// Put all the forces in the same set.
numForceBuffers
=
1
;
forceSets
.
push_back
(
vector
<
int
>
());
for
(
int
i
=
0
;
i
<
numForces
;
i
++
)
forceSets
[
0
].
push_back
(
i
);
}
else
{
// Figure out how many force buffers will be required.
for
(
int
i
=
0
;
i
<
numForces
;
i
++
)
...
...
@@ -107,8 +119,7 @@ void OpenCLBondedUtilities::initialize(const System& system) {
if
(
context
.
getNonbondedUtilities
().
getHasInteractions
())
bufferLimit
=
max
(
bufferLimit
,
context
.
getNonbondedUtilities
().
getNumForceBuffers
());
// For efficiency, we want to merge multiple forces into a single kernel - but only if that
// won't increase the number of force buffers. Figure out sets of forces that can be merged.
// Figure out sets of forces that can be merged.
vector
<
int
>
unmerged
(
numForces
);
for
(
int
i
=
0
;
i
<
numForces
;
i
++
)
...
...
@@ -137,6 +148,7 @@ void OpenCLBondedUtilities::initialize(const System& system) {
unmerged
.
erase
(
unmerged
.
begin
());
unmerged
.
pop_back
();
}
}
// Update the buffer indices based on merged sets.
...
...
@@ -162,9 +174,13 @@ void OpenCLBondedUtilities::initialize(const System& system) {
const
vector
<
int
>&
set
=
*
iter
;
int
setSize
=
set
.
size
();
stringstream
s
;
s
<<
"#ifdef SUPPORTS_64_BIT_ATOMICS
\n
"
;
s
<<
"#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
\n
"
;
s
<<
"#endif
\n
"
;
for
(
int
i
=
0
;
i
<
(
int
)
prefixCode
.
size
();
i
++
)
s
<<
prefixCode
[
i
];
s
<<
"__kernel void computeBondedForces(__global real4* restrict forceBuffers, __global real* restrict energyBuffer, __global const real4* restrict posq, int groups"
;
string
bufferType
=
(
context
.
getSupports64BitGlobalAtomics
()
?
"long"
:
"real4"
);
s
<<
"__kernel void computeBondedForces(__global "
<<
bufferType
<<
"* restrict forceBuffers, __global real* restrict energyBuffer, __global const real4* restrict posq, int groups"
;
for
(
int
i
=
0
;
i
<
setSize
;
i
++
)
{
int
force
=
set
[
i
];
string
indexType
=
"uint"
+
(
indexWidth
[
force
]
==
1
?
""
:
context
.
intToString
(
indexWidth
[
force
]));
...
...
@@ -219,10 +235,17 @@ string OpenCLBondedUtilities::createForceSource(int forceIndex, int numBonds, in
s
<<
computeForce
<<
"
\n
"
;
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
{
s
<<
" {
\n
"
;
if
(
context
.
getSupports64BitGlobalAtomics
())
{
s
<<
" atom_add(&forceBuffers[atom"
<<
(
i
+
1
)
<<
"], (long) (force"
<<
(
i
+
1
)
<<
".x*0x100000000));
\n
"
;
s
<<
" atom_add(&forceBuffers[atom"
<<
(
i
+
1
)
<<
"+PADDED_NUM_ATOMS], (long) (force"
<<
(
i
+
1
)
<<
".y*0x100000000));
\n
"
;
s
<<
" atom_add(&forceBuffers[atom"
<<
(
i
+
1
)
<<
"+2*PADDED_NUM_ATOMS], (long) (force"
<<
(
i
+
1
)
<<
".z*0x100000000));
\n
"
;
}
else
{
s
<<
" unsigned int offset = atom"
<<
(
i
+
1
)
<<
"+buffers"
<<
suffix
[
i
]
<<
"*PADDED_NUM_ATOMS;
\n
"
;
s
<<
" real4 force = forceBuffers[offset];
\n
"
;
s
<<
" force.xyz += force"
<<
(
i
+
1
)
<<
".xyz;
\n
"
;
s
<<
" forceBuffers[offset] = force;
\n
"
;
}
s
<<
" }
\n
"
;
}
s
<<
"}
\n
"
;
...
...
@@ -235,6 +258,9 @@ void OpenCLBondedUtilities::computeInteractions(int groups) {
for
(
int
i
=
0
;
i
<
(
int
)
forceSets
.
size
();
i
++
)
{
int
index
=
0
;
cl
::
Kernel
&
kernel
=
kernels
[
i
];
if
(
context
.
getSupports64BitGlobalAtomics
())
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
context
.
getLongForceBuffer
().
getDeviceBuffer
());
else
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
context
.
getForceBuffers
().
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
context
.
getEnergyBuffer
().
getDeviceBuffer
());
kernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
context
.
getPosq
().
getDeviceBuffer
());
...
...
platforms/opencl/src/OpenCLContext.cpp
View file @
93c467b2
...
...
@@ -97,6 +97,7 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
// Try to figure out which device is the fastest.
int
bestSpeed
=
-
1
;
bool
bestSupportsDouble
=
false
;
for
(
int
i
=
0
;
i
<
(
int
)
devices
.
size
();
i
++
)
{
if
(
platformVendor
==
"Apple"
&&
devices
[
i
].
getInfo
<
CL_DEVICE_VENDOR
>
()
==
"AMD"
)
continue
;
// Don't use AMD GPUs on OS X due to serious bugs.
...
...
@@ -135,9 +136,11 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
}
}
int
speed
=
devices
[
i
].
getInfo
<
CL_DEVICE_MAX_COMPUTE_UNITS
>
()
*
processingElementsPerComputeUnit
*
devices
[
i
].
getInfo
<
CL_DEVICE_MAX_CLOCK_FREQUENCY
>
();
if
(
maxSize
>=
minThreadBlockSize
&&
speed
>
bestSpeed
)
{
bool
supportsDouble
=
(
devices
[
i
].
getInfo
<
CL_DEVICE_EXTENSIONS
>
().
find
(
"cl_khr_fp64"
)
!=
string
::
npos
);
if
(
maxSize
>=
minThreadBlockSize
&&
speed
>
bestSpeed
&&
(
supportsDouble
||
!
bestSupportsDouble
))
{
deviceIndex
=
i
;
bestSpeed
=
speed
;
bestSupportsDouble
=
supportsDouble
;
}
}
}
...
...
@@ -173,9 +176,6 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
}
}
else
if
(
vendor
.
size
()
>=
28
&&
vendor
.
substr
(
0
,
28
)
==
"Advanced Micro Devices, Inc."
)
{
// Disable 64 bit atomics. A future version of the driver will support them, but until we can test that,
// it's safest not to use them.
supports64BitGlobalAtomics
=
false
;
if
(
device
.
getInfo
<
CL_DEVICE_TYPE
>
()
!=
CL_DEVICE_TYPE_GPU
)
{
/// \todo Is 6 a good value for the OpenCL CPU device?
// numThreadBlocksPerComputeUnit = ?;
...
...
@@ -190,14 +190,11 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
// check for errors.
try
{
#ifdef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD
// AMD has both 32 and 64 width SIMDs. Can determine by using:
// simdWidth = device.getInfo<CL_DEVICE_WAVEFRONT_WIDTH_AMD>();
// Must catch cl:Error as will fail if runtime does not support queries.
// However, the 32 width NVIDIA kernels do not have all the necessary
// barriers and so will not work for AMD.
// So for now leave default of 1 which will use the default kernels.
cl_uint
simdPerComputeUnit
=
device
.
getInfo
<
CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD
>
();
simdWidth
=
device
.
getInfo
<
CL_DEVICE_WAVEFRONT_WIDTH_AMD
>
();
// If the GPU has multiple SIMDs per compute unit then it uses the scalar instruction
// set instead of the VLIW instruction set. It therefore needs more thread blocks per
// compute unit to hide memory latency.
...
...
@@ -226,6 +223,10 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
compilationDefines
[
"SUPPORTS_64_BIT_ATOMICS"
]
=
""
;
if
(
supportsDoublePrecision
)
compilationDefines
[
"SUPPORTS_DOUBLE_PRECISION"
]
=
""
;
if
(
simdWidth
>=
32
)
compilationDefines
[
"SYNC_WARPS"
]
=
""
;
else
compilationDefines
[
"SYNC_WARPS"
]
=
"barrier(CLK_LOCAL_MEM_FENCE)"
;
vector
<
cl
::
Device
>
contextDevices
;
contextDevices
.
push_back
(
device
);
cl_context_properties
cprops
[]
=
{
CL_CONTEXT_PLATFORM
,
(
cl_context_properties
)
platforms
[
platformIndex
](),
0
};
...
...
platforms/opencl/src/OpenCLFFT3D.cpp
View file @
93c467b2
...
...
@@ -36,27 +36,24 @@ using namespace OpenMM;
using
namespace
std
;
OpenCLFFT3D
::
OpenCLFFT3D
(
OpenCLContext
&
context
,
int
xsize
,
int
ysize
,
int
zsize
)
:
context
(
context
),
xsize
(
xsize
),
ysize
(
ysize
),
zsize
(
zsize
)
{
zkernel
=
createKernel
(
xsize
,
ysize
,
zsize
);
xkernel
=
createKernel
(
ysize
,
zsize
,
xsize
);
ykernel
=
createKernel
(
zsize
,
xsize
,
ysize
);
zkernel
=
createKernel
(
xsize
,
ysize
,
zsize
,
zthreads
);
xkernel
=
createKernel
(
ysize
,
zsize
,
xsize
,
xthreads
);
ykernel
=
createKernel
(
zsize
,
xsize
,
ysize
,
ythreads
);
}
void
OpenCLFFT3D
::
execFFT
(
OpenCLArray
&
in
,
OpenCLArray
&
out
,
bool
forward
)
{
int
maxSize
=
xkernel
.
getWorkGroupInfo
<
CL_KERNEL_WORK_GROUP_SIZE
>
(
context
.
getDevice
());
if
(
context
.
getDevice
().
getInfo
<
CL_DEVICE_TYPE
>
()
==
CL_DEVICE_TYPE_CPU
)
maxSize
=
1
;
zkernel
.
setArg
<
cl
::
Buffer
>
(
0
,
in
.
getDeviceBuffer
());
zkernel
.
setArg
<
cl
::
Buffer
>
(
1
,
out
.
getDeviceBuffer
());
zkernel
.
setArg
<
cl_int
>
(
2
,
forward
?
1
:
-
1
);
context
.
executeKernel
(
zkernel
,
xsize
*
ysize
*
zsize
,
min
(
zsize
,
(
int
)
maxSize
)
);
context
.
executeKernel
(
zkernel
,
xsize
*
ysize
*
zsize
,
zthreads
);
xkernel
.
setArg
<
cl
::
Buffer
>
(
0
,
out
.
getDeviceBuffer
());
xkernel
.
setArg
<
cl
::
Buffer
>
(
1
,
in
.
getDeviceBuffer
());
xkernel
.
setArg
<
cl_int
>
(
2
,
forward
?
1
:
-
1
);
context
.
executeKernel
(
xkernel
,
xsize
*
ysize
*
zsize
,
min
(
xsize
,
(
int
)
maxSize
)
);
context
.
executeKernel
(
xkernel
,
xsize
*
ysize
*
zsize
,
xthreads
);
ykernel
.
setArg
<
cl
::
Buffer
>
(
0
,
in
.
getDeviceBuffer
());
ykernel
.
setArg
<
cl
::
Buffer
>
(
1
,
out
.
getDeviceBuffer
());
ykernel
.
setArg
<
cl_int
>
(
2
,
forward
?
1
:
-
1
);
context
.
executeKernel
(
ykernel
,
xsize
*
ysize
*
zsize
,
min
(
ysize
,
(
int
)
maxSize
)
);
context
.
executeKernel
(
ykernel
,
xsize
*
ysize
*
zsize
,
ythreads
);
}
int
OpenCLFFT3D
::
findLegalDimension
(
int
minimum
)
{
...
...
@@ -66,7 +63,7 @@ int OpenCLFFT3D::findLegalDimension(int minimum) {
// Attempt to factor the current value.
int
unfactored
=
minimum
;
for
(
int
factor
=
2
;
factor
<
6
;
factor
++
)
{
for
(
int
factor
=
2
;
factor
<
8
;
factor
++
)
{
while
(
unfactored
>
1
&&
unfactored
%
factor
==
0
)
unfactored
/=
factor
;
}
...
...
@@ -76,9 +73,10 @@ int OpenCLFFT3D::findLegalDimension(int minimum) {
}
}
cl
::
Kernel
OpenCLFFT3D
::
createKernel
(
int
xsize
,
int
ysize
,
int
zsize
)
{
cl
::
Kernel
OpenCLFFT3D
::
createKernel
(
int
xsize
,
int
ysize
,
int
zsize
,
int
&
threads
)
{
bool
loopRequired
=
(
context
.
getDevice
().
getInfo
<
CL_DEVICE_TYPE
>
()
==
CL_DEVICE_TYPE_CPU
);
stringstream
source
;
int
blocksPerGroup
=
(
loopRequired
?
1
:
max
(
1
,
256
/
zsize
));
int
stage
=
0
;
int
L
=
zsize
;
int
m
=
1
;
...
...
@@ -88,22 +86,85 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize) {
while
(
L
>
1
)
{
int
input
=
stage
%
2
;
int
output
=
1
-
input
;
int
radix
;
if
(
L
%
7
==
0
)
radix
=
7
;
else
if
(
L
%
5
==
0
)
radix
=
5
;
else
if
(
L
%
4
==
0
)
radix
=
4
;
else
if
(
L
%
3
==
0
)
radix
=
3
;
else
if
(
L
%
2
==
0
)
radix
=
2
;
else
throw
OpenMMException
(
"Illegal size for FFT: "
+
context
.
intToString
(
zsize
));
source
<<
"{
\n
"
;
if
(
L
%
5
==
0
)
{
L
=
L
/
5
;
source
<<
"// Pass "
<<
(
stage
+
1
)
<<
" (radix 5)
\n
"
;
if
(
loopRequired
)
L
=
L
/
radix
;
source
<<
"// Pass "
<<
(
stage
+
1
)
<<
" (radix "
<<
radix
<<
")
\n
"
;
if
(
loopRequired
)
{
source
<<
"for (int i = get_local_id(0); i < "
<<
(
L
*
m
)
<<
"; i += get_local_size(0)) {
\n
"
;
source
<<
"int base = i;
\n
"
;
}
else
{
source
<<
"if (get_local_id(0) < "
<<
(
L
*
m
)
<<
") {
\n
"
;
source
<<
"int i = get_local_id(0);
\n
"
;
source
<<
"if (get_local_id(0) < "
<<
(
blocksPerGroup
*
L
*
m
)
<<
") {
\n
"
;
source
<<
"int block = get_local_id(0)/"
<<
(
L
*
m
)
<<
";
\n
"
;
source
<<
"int i = get_local_id(0)-block*"
<<
(
L
*
m
)
<<
";
\n
"
;
source
<<
"int base = i+block*"
<<
zsize
<<
";
\n
"
;
}
source
<<
"int j = i/"
<<
m
<<
";
\n
"
;
source
<<
"real2 c0 = data"
<<
input
<<
"[i];
\n
"
;
source
<<
"real2 c1 = data"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 c2 = data"
<<
input
<<
"[i+"
<<
(
2
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 c3 = data"
<<
input
<<
"[i+"
<<
(
3
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 c4 = data"
<<
input
<<
"[i+"
<<
(
4
*
L
*
m
)
<<
"];
\n
"
;
if
(
radix
==
7
)
{
source
<<
"real2 c0 = data"
<<
input
<<
"[base];
\n
"
;
source
<<
"real2 c1 = data"
<<
input
<<
"[base+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 c2 = data"
<<
input
<<
"[base+"
<<
(
2
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 c3 = data"
<<
input
<<
"[base+"
<<
(
3
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 c4 = data"
<<
input
<<
"[base+"
<<
(
4
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 c5 = data"
<<
input
<<
"[base+"
<<
(
5
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 c6 = data"
<<
input
<<
"[base+"
<<
(
6
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 d0 = c1+c6;
\n
"
;
source
<<
"real2 d1 = c1-c6;
\n
"
;
source
<<
"real2 d2 = c2+c5;
\n
"
;
source
<<
"real2 d3 = c2-c5;
\n
"
;
source
<<
"real2 d4 = c4+c3;
\n
"
;
source
<<
"real2 d5 = c4-c3;
\n
"
;
source
<<
"real2 d6 = d2+d0;
\n
"
;
source
<<
"real2 d7 = d5+d3;
\n
"
;
source
<<
"real2 b0 = c0+d6+d4;
\n
"
;
source
<<
"real2 b1 = "
<<
context
.
doubleToString
((
cos
(
2
*
M_PI
/
7
)
+
cos
(
4
*
M_PI
/
7
)
+
cos
(
6
*
M_PI
/
7
))
/
3
-
1
)
<<
"*(d6+d4);
\n
"
;
source
<<
"real2 b2 = "
<<
context
.
doubleToString
((
2
*
cos
(
2
*
M_PI
/
7
)
-
cos
(
4
*
M_PI
/
7
)
-
cos
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d0-d4);
\n
"
;
source
<<
"real2 b3 = "
<<
context
.
doubleToString
((
cos
(
2
*
M_PI
/
7
)
-
2
*
cos
(
4
*
M_PI
/
7
)
+
cos
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d4-d2);
\n
"
;
source
<<
"real2 b4 = "
<<
context
.
doubleToString
((
cos
(
2
*
M_PI
/
7
)
+
cos
(
4
*
M_PI
/
7
)
-
2
*
cos
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d2-d0);
\n
"
;
source
<<
"real2 b5 = -sign*"
<<
context
.
doubleToString
((
sin
(
2
*
M_PI
/
7
)
+
sin
(
4
*
M_PI
/
7
)
-
sin
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d7+d1);
\n
"
;
source
<<
"real2 b6 = -sign*"
<<
context
.
doubleToString
((
2
*
sin
(
2
*
M_PI
/
7
)
-
sin
(
4
*
M_PI
/
7
)
+
sin
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d1-d5);
\n
"
;
source
<<
"real2 b7 = -sign*"
<<
context
.
doubleToString
((
sin
(
2
*
M_PI
/
7
)
-
2
*
sin
(
4
*
M_PI
/
7
)
-
sin
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d5-d3);
\n
"
;
source
<<
"real2 b8 = -sign*"
<<
context
.
doubleToString
((
sin
(
2
*
M_PI
/
7
)
+
sin
(
4
*
M_PI
/
7
)
+
2
*
sin
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d3-d1);
\n
"
;
source
<<
"real2 t0 = b0+b1;
\n
"
;
source
<<
"real2 t1 = b2+b3;
\n
"
;
source
<<
"real2 t2 = b4-b3;
\n
"
;
source
<<
"real2 t3 = -b2-b4;
\n
"
;
source
<<
"real2 t4 = b6+b7;
\n
"
;
source
<<
"real2 t5 = b8-b7;
\n
"
;
source
<<
"real2 t6 = -b8-b6;
\n
"
;
source
<<
"real2 t7 = t0+t1;
\n
"
;
source
<<
"real2 t8 = t0+t2;
\n
"
;
source
<<
"real2 t9 = t0+t3;
\n
"
;
source
<<
"real2 t10 = (real2) (t4.y+b5.y, -(t4.x+b5.x));
\n
"
;
source
<<
"real2 t11 = (real2) (t5.y+b5.y, -(t5.x+b5.x));
\n
"
;
source
<<
"real2 t12 = (real2) (t6.y+b5.y, -(t6.x+b5.x));
\n
"
;
source
<<
"data"
<<
output
<<
"[base+6*j*"
<<
m
<<
"] = b0;
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(6*j+1)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
zsize
<<
"/"
<<
(
7
*
L
)
<<
"], t7-t10);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(6*j+2)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
2
*
zsize
)
<<
"/"
<<
(
7
*
L
)
<<
"], t9-t12);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(6*j+3)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
3
*
zsize
)
<<
"/"
<<
(
7
*
L
)
<<
"], t8+t11);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(6*j+4)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
4
*
zsize
)
<<
"/"
<<
(
7
*
L
)
<<
"], t8-t11);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(6*j+5)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
5
*
zsize
)
<<
"/"
<<
(
7
*
L
)
<<
"], t9+t12);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(6*j+6)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
6
*
zsize
)
<<
"/"
<<
(
7
*
L
)
<<
"], t7+t10);
\n
"
;
}
else
if
(
radix
==
5
)
{
source
<<
"real2 c0 = data"
<<
input
<<
"[base];
\n
"
;
source
<<
"real2 c1 = data"
<<
input
<<
"[base+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 c2 = data"
<<
input
<<
"[base+"
<<
(
2
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 c3 = data"
<<
input
<<
"[base+"
<<
(
3
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 c4 = data"
<<
input
<<
"[base+"
<<
(
4
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 d0 = c1+c4;
\n
"
;
source
<<
"real2 d1 = c2+c3;
\n
"
;
source
<<
"real2 d2 = "
<<
context
.
doubleToString
(
sin
(
0.4
*
M_PI
))
<<
"*(c1-c4);
\n
"
;
...
...
@@ -116,80 +177,45 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize) {
string
coeff
=
context
.
doubleToString
(
sin
(
0.2
*
M_PI
)
/
sin
(
0.4
*
M_PI
));
source
<<
"real2 d9 = sign*(real2) (d2.y+"
<<
coeff
<<
"*d3.y, -d2.x-"
<<
coeff
<<
"*d3.x);
\n
"
;
source
<<
"real2 d10 = sign*(real2) ("
<<
coeff
<<
"*d2.y-d3.y, d3.x-"
<<
coeff
<<
"*d2.x);
\n
"
;
source
<<
"data"
<<
output
<<
"[i+4*j*"
<<
m
<<
"] = c0+d4;
\n
"
;
source
<<
"data"
<<
output
<<
"[i+(4*j+1)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
zsize
<<
"/"
<<
(
5
*
L
)
<<
"], d7+d9);
\n
"
;
source
<<
"data"
<<
output
<<
"[i+(4*j+2)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
2
*
zsize
)
<<
"/"
<<
(
5
*
L
)
<<
"], d8+d10);
\n
"
;
source
<<
"data"
<<
output
<<
"[i+(4*j+3)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
3
*
zsize
)
<<
"/"
<<
(
5
*
L
)
<<
"], d8-d10);
\n
"
;
source
<<
"data"
<<
output
<<
"[i+(4*j+4)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
4
*
zsize
)
<<
"/"
<<
(
5
*
L
)
<<
"], d7-d9);
\n
"
;
source
<<
"}
\n
"
;
m
=
m
*
5
;
source
<<
"data"
<<
output
<<
"[base+4*j*"
<<
m
<<
"] = c0+d4;
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(4*j+1)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
zsize
<<
"/"
<<
(
5
*
L
)
<<
"], d7+d9);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(4*j+2)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
2
*
zsize
)
<<
"/"
<<
(
5
*
L
)
<<
"], d8+d10);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(4*j+3)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
3
*
zsize
)
<<
"/"
<<
(
5
*
L
)
<<
"], d8-d10);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(4*j+4)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
4
*
zsize
)
<<
"/"
<<
(
5
*
L
)
<<
"], d7-d9);
\n
"
;
}
else
if
(
L
%
4
==
0
)
{
L
=
L
/
4
;
source
<<
"// Pass "
<<
(
stage
+
1
)
<<
" (radix 4)
\n
"
;
if
(
loopRequired
)
source
<<
"for (int i = get_local_id(0); i < "
<<
(
L
*
m
)
<<
"; i += get_local_size(0)) {
\n
"
;
else
{
source
<<
"if (get_local_id(0) < "
<<
(
L
*
m
)
<<
") {
\n
"
;
source
<<
"int i = get_local_id(0);
\n
"
;
}
source
<<
"int j = i/"
<<
m
<<
";
\n
"
;
source
<<
"real2 c0 = data"
<<
input
<<
"[i];
\n
"
;
source
<<
"real2 c1 = data"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 c2 = data"
<<
input
<<
"[i+"
<<
(
2
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 c3 = data"
<<
input
<<
"[i+"
<<
(
3
*
L
*
m
)
<<
"];
\n
"
;
else
if
(
radix
==
4
)
{
source
<<
"real2 c0 = data"
<<
input
<<
"[base];
\n
"
;
source
<<
"real2 c1 = data"
<<
input
<<
"[base+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 c2 = data"
<<
input
<<
"[base+"
<<
(
2
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 c3 = data"
<<
input
<<
"[base+"
<<
(
3
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 d0 = c0+c2;
\n
"
;
source
<<
"real2 d1 = c0-c2;
\n
"
;
source
<<
"real2 d2 = c1+c3;
\n
"
;
source
<<
"real2 d3 = sign*(real2) (c1.y-c3.y, c3.x-c1.x);
\n
"
;
source
<<
"data"
<<
output
<<
"[i+3*j*"
<<
m
<<
"] = d0+d2;
\n
"
;
source
<<
"data"
<<
output
<<
"[i+(3*j+1)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
zsize
<<
"/"
<<
(
4
*
L
)
<<
"], d1+d3);
\n
"
;
source
<<
"data"
<<
output
<<
"[i+(3*j+2)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
2
*
zsize
)
<<
"/"
<<
(
4
*
L
)
<<
"], d0-d2);
\n
"
;
source
<<
"data"
<<
output
<<
"[i+(3*j+3)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
3
*
zsize
)
<<
"/"
<<
(
4
*
L
)
<<
"], d1-d3);
\n
"
;
source
<<
"}
\n
"
;
m
=
m
*
4
;
source
<<
"data"
<<
output
<<
"[base+3*j*"
<<
m
<<
"] = d0+d2;
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(3*j+1)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
zsize
<<
"/"
<<
(
4
*
L
)
<<
"], d1+d3);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(3*j+2)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
2
*
zsize
)
<<
"/"
<<
(
4
*
L
)
<<
"], d0-d2);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(3*j+3)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
3
*
zsize
)
<<
"/"
<<
(
4
*
L
)
<<
"], d1-d3);
\n
"
;
}
else
if
(
L
%
3
==
0
)
{
L
=
L
/
3
;
source
<<
"// Pass "
<<
(
stage
+
1
)
<<
" (radix 3)
\n
"
;
if
(
loopRequired
)
source
<<
"for (int i = get_local_id(0); i < "
<<
(
L
*
m
)
<<
"; i += get_local_size(0)) {
\n
"
;
else
{
source
<<
"if (get_local_id(0) < "
<<
(
L
*
m
)
<<
") {
\n
"
;
source
<<
"int i = get_local_id(0);
\n
"
;
}
source
<<
"int j = i/"
<<
m
<<
";
\n
"
;
source
<<
"real2 c0 = data"
<<
input
<<
"[i];
\n
"
;
source
<<
"real2 c1 = data"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 c2 = data"
<<
input
<<
"[i+"
<<
(
2
*
L
*
m
)
<<
"];
\n
"
;
else
if
(
radix
==
3
)
{
source
<<
"real2 c0 = data"
<<
input
<<
"[base];
\n
"
;
source
<<
"real2 c1 = data"
<<
input
<<
"[base+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 c2 = data"
<<
input
<<
"[base+"
<<
(
2
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 d0 = c1+c2;
\n
"
;
source
<<
"real2 d1 = c0-0.5f*d0;
\n
"
;
source
<<
"real2 d2 = sign*"
<<
context
.
doubleToString
(
sin
(
M_PI
/
3.0
))
<<
"*(real2) (c1.y-c2.y, c2.x-c1.x);
\n
"
;
source
<<
"data"
<<
output
<<
"[i+2*j*"
<<
m
<<
"] = c0+d0;
\n
"
;
source
<<
"data"
<<
output
<<
"[i+(2*j+1)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
zsize
<<
"/"
<<
(
3
*
L
)
<<
"], d1+d2);
\n
"
;
source
<<
"data"
<<
output
<<
"[i+(2*j+2)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
2
*
zsize
)
<<
"/"
<<
(
3
*
L
)
<<
"], d1-d2);
\n
"
;
source
<<
"}
\n
"
;
m
=
m
*
3
;
source
<<
"data"
<<
output
<<
"[base+2*j*"
<<
m
<<
"] = c0+d0;
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(2*j+1)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
zsize
<<
"/"
<<
(
3
*
L
)
<<
"], d1+d2);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(2*j+2)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
2
*
zsize
)
<<
"/"
<<
(
3
*
L
)
<<
"], d1-d2);
\n
"
;
}
else
if
(
L
%
2
==
0
)
{
L
=
L
/
2
;
source
<<
"// Pass "
<<
(
stage
+
1
)
<<
" (radix 2)
\n
"
;
if
(
loopRequired
)
source
<<
"for (int i = get_local_id(0); i < "
<<
(
L
*
m
)
<<
"; i += get_local_size(0)) {
\n
"
;
else
{
source
<<
"if (get_local_id(0) < "
<<
(
L
*
m
)
<<
") {
\n
"
;
source
<<
"int i = get_local_id(0);
\n
"
;
else
if
(
radix
==
2
)
{
source
<<
"real2 c0 = data"
<<
input
<<
"[base];
\n
"
;
source
<<
"real2 c1 = data"
<<
input
<<
"[base+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"data"
<<
output
<<
"[base+j*"
<<
m
<<
"] = c0+c1;
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(j+1)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
zsize
<<
"/"
<<
(
2
*
L
)
<<
"], c0-c1);
\n
"
;
}
source
<<
"int j = i/"
<<
m
<<
";
\n
"
;
source
<<
"real2 c0 = data"
<<
input
<<
"[i];
\n
"
;
source
<<
"real2 c1 = data"
<<
input
<<
"[i+"
<<
(
L
*
m
)
<<
"];
\n
"
;
source
<<
"data"
<<
output
<<
"[i+j*"
<<
m
<<
"] = c0+c1;
\n
"
;
source
<<
"data"
<<
output
<<
"[i+(j+1)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
zsize
<<
"/"
<<
(
2
*
L
)
<<
"], c0-c1);
\n
"
;
source
<<
"}
\n
"
;
m
=
m
*
2
;
}
else
throw
OpenMMException
(
"Illegal size for FFT: "
+
context
.
intToString
(
zsize
));
m
=
m
*
radix
;
source
<<
"barrier(CLK_LOCAL_MEM_FENCE);
\n
"
;
source
<<
"}
\n
"
;
++
stage
;
...
...
@@ -202,20 +228,22 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize) {
source
<<
"out[y*(ZSIZE*XSIZE)+z*XSIZE+x] = data"
<<
(
stage
%
2
)
<<
"[z];
\n
"
;
}
else
source
<<
"out[y*(ZSIZE*XSIZE)+get_local_id(0)*XSIZE+x] = data"
<<
(
stage
%
2
)
<<
"[get_local_id(0)];
\n
"
;
source
<<
"out[y*(ZSIZE*XSIZE)+
(
get_local_id(0)
%ZSIZE)
*XSIZE+x] = data"
<<
(
stage
%
2
)
<<
"[get_local_id(0)];
\n
"
;
source
<<
"barrier(CLK_GLOBAL_MEM_FENCE);"
;
map
<
string
,
string
>
replacements
;
replacements
[
"XSIZE"
]
=
context
.
intToString
(
xsize
);
replacements
[
"YSIZE"
]
=
context
.
intToString
(
ysize
);
replacements
[
"ZSIZE"
]
=
context
.
intToString
(
zsize
);
replacements
[
"BLOCKS_PER_GROUP"
]
=
context
.
intToString
(
blocksPerGroup
);
replacements
[
"M_PI"
]
=
context
.
doubleToString
(
M_PI
);
replacements
[
"COMPUTE_FFT"
]
=
source
.
str
();
replacements
[
"LOOP_REQUIRED"
]
=
(
loopRequired
?
"1"
:
"0"
);
cl
::
Program
program
=
context
.
createProgram
(
context
.
replaceStrings
(
OpenCLKernelSources
::
fft
,
replacements
));
cl
::
Kernel
kernel
(
program
,
"execFFT"
);
int
bufferSize
=
zsize
*
(
context
.
getUseDoublePrecision
()
?
sizeof
(
mm_double2
)
:
sizeof
(
mm_float2
));
int
bufferSize
=
blocksPerGroup
*
zsize
*
(
context
.
getUseDoublePrecision
()
?
sizeof
(
mm_double2
)
:
sizeof
(
mm_float2
));
kernel
.
setArg
(
3
,
bufferSize
,
NULL
);
kernel
.
setArg
(
4
,
bufferSize
,
NULL
);
kernel
.
setArg
(
5
,
bufferSize
,
NULL
);
threads
=
(
loopRequired
?
1
:
blocksPerGroup
*
zsize
);
return
kernel
;
}
platforms/opencl/src/OpenCLFFT3D.h
View file @
93c467b2
...
...
@@ -81,8 +81,9 @@ public:
*/
static
int
findLegalDimension
(
int
minimum
);
private:
cl
::
Kernel
createKernel
(
int
xsize
,
int
ysize
,
int
zsize
);
cl
::
Kernel
createKernel
(
int
xsize
,
int
ysize
,
int
zsize
,
int
&
threads
);
int
xsize
,
ysize
,
zsize
;
int
xthreads
,
ythreads
,
zthreads
;
OpenCLContext
&
context
;
cl
::
Kernel
xkernel
,
ykernel
,
zkernel
;
};
...
...
platforms/opencl/src/OpenCLIntegrationUtilities.cpp
View file @
93c467b2
...
...
@@ -99,7 +99,7 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
random
(
NULL
),
randomSeed
(
NULL
),
randomPos
(
0
),
stepSize
(
NULL
),
ccmaAtoms
(
NULL
),
ccmaDistance
(
NULL
),
ccmaReducedMass
(
NULL
),
ccmaAtomConstraints
(
NULL
),
ccmaNumAtomConstraints
(
NULL
),
ccmaConstraintMatrixColumn
(
NULL
),
ccmaConstraintMatrixValue
(
NULL
),
ccmaDelta1
(
NULL
),
ccmaDelta2
(
NULL
),
ccmaConverged
(
NULL
),
ccmaConvergedBuffer
(
NULL
),
vsite2AvgAtoms
(
NULL
),
vsite2AvgWeights
(
NULL
),
vsite3AvgAtoms
(
NULL
),
vsite3AvgWeights
(
NULL
),
vsite2AvgAtoms
(
NULL
),
vsite2AvgWeights
(
NULL
),
vsite3AvgAtoms
(
NULL
),
vsite3AvgWeights
(
NULL
),
vsiteOutOfPlaneAtoms
(
NULL
),
vsiteOutOfPlaneWeights
(
NULL
),
hasInitializedPosConstraintKernels
(
false
),
hasInitializedVelConstraintKernels
(
false
)
{
// Create workspace arrays.
...
...
@@ -479,8 +479,6 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
ccmaNumAtomConstraints
=
OpenCLArray
::
create
<
cl_int
>
(
context
,
numAtoms
,
"CcmaAtomConstraintsIndex"
);
ccmaConstraintMatrixColumn
=
OpenCLArray
::
create
<
cl_int
>
(
context
,
numCCMA
*
maxRowElements
,
"ConstraintMatrixColumn"
);
ccmaConverged
=
OpenCLArray
::
create
<
cl_int
>
(
context
,
2
,
"CcmaConverged"
);
ccmaConvergedBuffer
=
new
cl
::
Buffer
(
context
.
getContext
(),
CL_MEM_ALLOC_HOST_PTR
,
2
*
sizeof
(
cl_int
));
ccmaConvergedMemory
=
(
cl_int
*
)
context
.
getQueue
().
enqueueMapBuffer
(
*
ccmaConvergedBuffer
,
CL_TRUE
,
CL_MAP_READ
|
CL_MAP_WRITE
,
0
,
2
*
sizeof
(
cl_int
));
vector
<
mm_int2
>
atomsVec
(
ccmaAtoms
->
getSize
());
vector
<
cl_int
>
atomConstraintsVec
(
ccmaAtomConstraints
->
getSize
());
vector
<
cl_int
>
numAtomConstraintsVec
(
ccmaNumAtomConstraints
->
getSize
());
...
...
@@ -660,24 +658,28 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
defines
[
"NUM_OUT_OF_PLANE"
]
=
context
.
intToString
(
numOutOfPlane
);
cl
::
Program
vsiteProgram
=
context
.
createProgram
(
OpenCLKernelSources
::
virtualSites
,
defines
);
vsitePositionKernel
=
cl
::
Kernel
(
vsiteProgram
,
"computeVirtualSites"
);
vsitePositionKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
context
.
getPosq
().
getDeviceBuffer
());
setPosqCorrectionArg
(
context
,
vsitePositionKernel
,
1
);
vsitePositionKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
vsite2AvgAtoms
->
getDeviceBuffer
());
vsitePositionKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
vsite2AvgWeights
->
getDeviceBuffer
());
vsitePositionKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
vsite3AvgAtoms
->
getDeviceBuffer
());
vsitePositionKernel
.
setArg
<
cl
::
Buffer
>
(
5
,
vsite3AvgWeights
->
getDeviceBuffer
());
vsitePositionKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
vsiteOutOfPlaneAtoms
->
getDeviceBuffer
());
vsitePositionKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
vsiteOutOfPlaneWeights
->
getDeviceBuffer
());
int
index
=
0
;
vsitePositionKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
context
.
getPosq
().
getDeviceBuffer
());
if
(
context
.
getUseMixedPrecision
())
vsitePositionKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
context
.
getPosqCorrection
().
getDeviceBuffer
());
vsitePositionKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
vsite2AvgAtoms
->
getDeviceBuffer
());
vsitePositionKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
vsite2AvgWeights
->
getDeviceBuffer
());
vsitePositionKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
vsite3AvgAtoms
->
getDeviceBuffer
());
vsitePositionKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
vsite3AvgWeights
->
getDeviceBuffer
());
vsitePositionKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
vsiteOutOfPlaneAtoms
->
getDeviceBuffer
());
vsitePositionKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
vsiteOutOfPlaneWeights
->
getDeviceBuffer
());
vsiteForceKernel
=
cl
::
Kernel
(
vsiteProgram
,
"distributeForces"
);
vsiteForceKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
context
.
getPosq
().
getDeviceBuffer
());
setPosqCorrectionArg
(
context
,
vsiteForceKernel
,
1
);
// Skip argument 2: the force array hasn't been created yet.
vsiteForceKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
vsite2AvgAtoms
->
getDeviceBuffer
());
vsiteForceKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
vsite2AvgWeights
->
getDeviceBuffer
());
vsiteForceKernel
.
setArg
<
cl
::
Buffer
>
(
5
,
vsite3AvgAtoms
->
getDeviceBuffer
());
vsiteForceKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
vsite3AvgWeights
->
getDeviceBuffer
());
vsiteForceKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
vsiteOutOfPlaneAtoms
->
getDeviceBuffer
());
vsiteForceKernel
.
setArg
<
cl
::
Buffer
>
(
8
,
vsiteOutOfPlaneWeights
->
getDeviceBuffer
());
index
=
0
;
vsiteForceKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
context
.
getPosq
().
getDeviceBuffer
());
index
++
;
// Skip argument 1: the force array hasn't been created yet.
if
(
context
.
getUseMixedPrecision
())
vsiteForceKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
context
.
getPosqCorrection
().
getDeviceBuffer
());
vsiteForceKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
vsite2AvgAtoms
->
getDeviceBuffer
());
vsiteForceKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
vsite2AvgWeights
->
getDeviceBuffer
());
vsiteForceKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
vsite3AvgAtoms
->
getDeviceBuffer
());
vsiteForceKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
vsite3AvgWeights
->
getDeviceBuffer
());
vsiteForceKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
vsiteOutOfPlaneAtoms
->
getDeviceBuffer
());
vsiteForceKernel
.
setArg
<
cl
::
Buffer
>
(
index
++
,
vsiteOutOfPlaneWeights
->
getDeviceBuffer
());
numVsites
=
num2Avg
+
num3Avg
+
numOutOfPlane
;
}
...
...
@@ -718,8 +720,6 @@ OpenCLIntegrationUtilities::~OpenCLIntegrationUtilities() {
delete
ccmaDelta2
;
if
(
ccmaConverged
!=
NULL
)
delete
ccmaConverged
;
if
(
ccmaConvergedBuffer
!=
NULL
)
delete
ccmaConvergedBuffer
;
if
(
vsite2AvgAtoms
!=
NULL
)
delete
vsite2AvgAtoms
;
if
(
vsite2AvgWeights
!=
NULL
)
...
...
@@ -807,6 +807,7 @@ void OpenCLIntegrationUtilities::applyConstraints(bool constrainVelocities, doub
ccmaDirectionsKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
context
.
getPosqCorrection
().
getDeviceBuffer
());
else
ccmaDirectionsKernel
.
setArg
<
void
*>
(
3
,
NULL
);
ccmaDirectionsKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
ccmaConverged
->
getDeviceBuffer
());
ccmaForceKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
ccmaAtoms
->
getDeviceBuffer
());
ccmaForceKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
ccmaDistance
->
getDeviceBuffer
());
ccmaForceKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
constrainVelocities
?
context
.
getVelm
().
getDeviceBuffer
()
:
posDelta
->
getDeviceBuffer
());
...
...
@@ -834,23 +835,19 @@ void OpenCLIntegrationUtilities::applyConstraints(bool constrainVelocities, doub
context
.
executeKernel
(
ccmaDirectionsKernel
,
ccmaAtoms
->
getSize
());
const
int
checkInterval
=
4
;
cl
::
Event
event
;
int
*
converged
=
(
int
*
)
context
.
getPinnedBuffer
();
for
(
int
i
=
0
;
i
<
150
;
i
++
)
{
ccmaForceKernel
.
setArg
<
cl_int
>
(
7
,
i
);
if
(
i
==
0
)
{
ccmaConvergedMemory
[
0
]
=
1
;
ccmaConvergedMemory
[
1
]
=
0
;
context
.
getQueue
().
enqueueWriteBuffer
(
ccmaConverged
->
getDeviceBuffer
(),
CL_FALSE
,
0
,
2
*
sizeof
(
cl_int
),
ccmaConvergedMemory
);
}
context
.
executeKernel
(
ccmaForceKernel
,
ccmaAtoms
->
getSize
());
if
((
i
+
1
)
%
checkInterval
==
0
)
context
.
getQueue
().
enqueueReadBuffer
(
ccmaConverged
->
getDeviceBuffer
(),
CL_FALSE
,
0
,
2
*
sizeof
(
cl_int
),
c
cmaC
onverged
Memory
,
NULL
,
&
event
);
context
.
getQueue
().
enqueueReadBuffer
(
ccmaConverged
->
getDeviceBuffer
(),
CL_FALSE
,
0
,
2
*
sizeof
(
cl_int
),
converged
,
NULL
,
&
event
);
ccmaMultiplyKernel
.
setArg
<
cl_int
>
(
5
,
i
);
context
.
executeKernel
(
ccmaMultiplyKernel
,
ccmaAtoms
->
getSize
());
ccmaUpdateKernel
.
setArg
<
cl_int
>
(
8
,
i
);
context
.
executeKernel
(
ccmaUpdateKernel
,
context
.
getNumAtoms
());
if
((
i
+
1
)
%
checkInterval
==
0
)
{
event
.
wait
();
if
(
c
cmaC
onverged
Memory
[
i
%
2
])
if
(
converged
[
i
%
2
])
break
;
}
}
...
...
@@ -864,7 +861,7 @@ void OpenCLIntegrationUtilities::computeVirtualSites() {
void
OpenCLIntegrationUtilities
::
distributeForcesFromVirtualSites
()
{
if
(
numVsites
>
0
)
{
vsiteForceKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
context
.
getForce
().
getDeviceBuffer
());
vsiteForceKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
context
.
getForce
().
getDeviceBuffer
());
context
.
executeKernel
(
vsiteForceKernel
,
numVsites
);
}
}
...
...
platforms/opencl/src/OpenCLIntegrationUtilities.h
View file @
93c467b2
...
...
@@ -141,8 +141,6 @@ private:
OpenCLArray
*
ccmaDelta1
;
OpenCLArray
*
ccmaDelta2
;
OpenCLArray
*
ccmaConverged
;
cl
::
Buffer
*
ccmaConvergedBuffer
;
cl_int
*
ccmaConvergedMemory
;
OpenCLArray
*
vsite2AvgAtoms
;
OpenCLArray
*
vsite2AvgWeights
;
OpenCLArray
*
vsite3AvgAtoms
;
...
...
platforms/opencl/src/OpenCLKernels.cpp
View file @
93c467b2
This diff is collapsed.
Click to expand it.
platforms/opencl/src/OpenCLKernels.h
View file @
93c467b2
...
...
@@ -556,7 +556,7 @@ class OpenCLCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
public:
OpenCLCalcNonbondedForceKernel
(
std
::
string
name
,
const
Platform
&
platform
,
OpenCLContext
&
cl
,
System
&
system
)
:
CalcNonbondedForceKernel
(
name
,
platform
),
hasInitializedKernel
(
false
),
cl
(
cl
),
sigmaEpsilon
(
NULL
),
exceptionParams
(
NULL
),
cosSinSums
(
NULL
),
pmeGrid
(
NULL
),
pmeGrid2
(
NULL
),
pmeBsplineModuliX
(
NULL
),
pmeBsplineModuliY
(
NULL
),
pmeBsplineModuliZ
(
NULL
),
pmeBsplineTheta
(
NULL
),
pmeBsplineDTheta
(
NULL
),
pmeGrid2
(
NULL
),
pmeBsplineModuliX
(
NULL
),
pmeBsplineModuliY
(
NULL
),
pmeBsplineModuliZ
(
NULL
),
pmeBsplineTheta
(
NULL
),
pmeAtomRange
(
NULL
),
pmeAtomGridIndex
(
NULL
),
sort
(
NULL
),
fft
(
NULL
)
{
}
~
OpenCLCalcNonbondedForceKernel
();
...
...
@@ -586,15 +586,15 @@ public:
*/
void
copyParametersToContext
(
ContextImpl
&
context
,
const
NonbondedForce
&
force
);
private:
struct
SortTrait
{
typedef
mm_int2
DataType
;
typedef
cl_int
KeyType
;
static
const
char
*
cl
DataType
()
{
return
"int2"
;}
static
const
char
*
cl
KeyType
()
{
return
"int"
;}
static
const
char
*
cl
MinKey
()
{
return
"INT_MIN"
;}
static
const
char
*
cl
MaxKey
()
{
return
"INT_MAX"
;}
static
const
char
*
cl
MaxValue
()
{
return
"(int2) (INT_MAX, INT_MAX)"
;}
static
const
char
*
cl
SortKey
()
{
return
"value.y"
;}
class
SortTrait
:
public
OpenCLSort
::
SortTrait
{
int
getDataSize
()
const
{
return
8
;}
int
getKeySize
()
const
{
return
4
;}
const
char
*
get
DataType
()
const
{
return
"int2"
;}
const
char
*
get
KeyType
()
const
{
return
"int"
;}
const
char
*
get
MinKey
()
const
{
return
"INT_MIN"
;}
const
char
*
get
MaxKey
()
const
{
return
"INT_MAX"
;}
const
char
*
get
MaxValue
()
const
{
return
"(int2) (INT_MAX, INT_MAX)"
;}
const
char
*
get
SortKey
()
const
{
return
"value.y"
;}
};
OpenCLContext
&
cl
;
bool
hasInitializedKernel
;
...
...
@@ -607,10 +607,9 @@ private:
OpenCLArray
*
pmeBsplineModuliY
;
OpenCLArray
*
pmeBsplineModuliZ
;
OpenCLArray
*
pmeBsplineTheta
;
OpenCLArray
*
pmeBsplineDTheta
;
OpenCLArray
*
pmeAtomRange
;
OpenCLArray
*
pmeAtomGridIndex
;
OpenCLSort
<
SortTrait
>
*
sort
;
OpenCLSort
*
sort
;
OpenCLFFT3D
*
fft
;
cl
::
Kernel
ewaldSumsKernel
;
cl
::
Kernel
ewaldForcesKernel
;
...
...
@@ -625,7 +624,6 @@ private:
std
::
map
<
std
::
string
,
std
::
string
>
pmeDefines
;
std
::
vector
<
std
::
pair
<
int
,
int
>
>
exceptionAtoms
;
double
ewaldSelfEnergy
,
dispersionCoefficient
,
alpha
;
int
interpolateForceThreads
;
bool
hasCoulomb
,
hasLJ
;
static
const
int
PmeOrder
=
5
;
};
...
...
@@ -775,6 +773,8 @@ private:
std
::
vector
<
bool
>
pairValueUsesParam
,
pairEnergyUsesParam
,
pairEnergyUsesValue
;
System
&
system
;
cl
::
Kernel
pairValueKernel
,
perParticleValueKernel
,
pairEnergyKernel
,
perParticleEnergyKernel
,
gradientChainRuleKernel
;
std
::
string
pairValueSrc
,
pairEnergySrc
;
std
::
map
<
std
::
string
,
std
::
string
>
pairValueDefines
,
pairEnergyDefines
;
};
/**
...
...
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
View file @
93c467b2
This diff is collapsed.
Click to expand it.
platforms/opencl/src/OpenCLNonbondedUtilities.h
View file @
93c467b2
This diff is collapsed.
Click to expand it.
platforms/opencl/src/OpenCLParallelKernels.cpp
View file @
93c467b2
...
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011-201
2
Stanford University and the Authors. *
* Portions copyright (c) 2011-201
3
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -108,7 +108,7 @@ private:
};
OpenCLParallelCalcForcesAndEnergyKernel
::
OpenCLParallelCalcForcesAndEnergyKernel
(
string
name
,
const
Platform
&
platform
,
OpenCLPlatform
::
PlatformData
&
data
)
:
CalcForcesAndEnergyKernel
(
name
,
platform
),
data
(
data
),
completionTimes
(
data
.
contexts
.
size
()),
context
Tile
s
(
data
.
contexts
.
size
()),
contextForces
(
NULL
),
CalcForcesAndEnergyKernel
(
name
,
platform
),
data
(
data
),
completionTimes
(
data
.
contexts
.
size
()),
context
NonbondedFraction
s
(
data
.
contexts
.
size
()),
contextForces
(
NULL
),
pinnedPositionBuffer
(
NULL
),
pinnedPositionMemory
(
NULL
),
pinnedForceBuffer
(
NULL
),
pinnedForceMemory
(
NULL
)
{
for
(
int
i
=
0
;
i
<
(
int
)
data
.
contexts
.
size
();
i
++
)
kernels
.
push_back
(
Kernel
(
new
OpenCLCalcForcesAndEnergyKernel
(
name
,
platform
,
*
data
.
contexts
[
i
])));
...
...
@@ -126,6 +126,8 @@ OpenCLParallelCalcForcesAndEnergyKernel::~OpenCLParallelCalcForcesAndEnergyKerne
void
OpenCLParallelCalcForcesAndEnergyKernel
::
initialize
(
const
System
&
system
)
{
for
(
int
i
=
0
;
i
<
(
int
)
kernels
.
size
();
i
++
)
getKernel
(
i
).
initialize
(
system
);
for
(
int
i
=
0
;
i
<
(
int
)
contextNonbondedFractions
.
size
();
i
++
)
contextNonbondedFractions
[
i
]
=
1
/
(
double
)
contextNonbondedFractions
.
size
();
}
void
OpenCLParallelCalcForcesAndEnergyKernel
::
beginComputation
(
ContextImpl
&
context
,
bool
includeForce
,
bool
includeEnergy
,
int
groups
)
{
...
...
@@ -172,30 +174,26 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c
numAtoms
*
(
data
.
contexts
.
size
()
-
1
)
*
elementSize
,
pinnedForceMemory
);
cl
.
reduceBuffer
(
*
contextForces
,
data
.
contexts
.
size
());
// Balance work between the contexts by transferring a
few
nonbonded
tiles
from the context that
// Balance work between the contexts by transferring a
little
nonbonded
work
from the context that
// finished last to the one that finished first.
int
firstIndex
=
0
,
lastIndex
=
0
;
int
totalTiles
=
0
;
for
(
int
i
=
0
;
i
<
(
int
)
completionTimes
.
size
();
i
++
)
{
if
(
completionTimes
[
i
]
<
completionTimes
[
firstIndex
])
firstIndex
=
i
;
if
(
completionTimes
[
i
]
>
completionTimes
[
lastIndex
])
lastIndex
=
i
;
contextTiles
[
i
]
=
data
.
contexts
[
i
]
->
getNonbondedUtilities
().
getNumTiles
();
totalTiles
+=
contextTiles
[
i
];
}
int
tilesToTransfer
=
totalTiles
/
1000
;
if
(
tilesToTransfer
<
1
)
tilesToTransfer
=
1
;
if
(
tilesToTransfer
>
contextTiles
[
lastIndex
])
tilesToTransfer
=
contextTiles
[
lastIndex
];
contextTiles
[
firstIndex
]
+=
tilesToTransfer
;
contextTiles
[
lastIndex
]
-=
tilesToTransfer
;
int
startIndex
=
0
;
for
(
int
i
=
0
;
i
<
(
int
)
contextTiles
.
size
();
i
++
)
{
data
.
contexts
[
i
]
->
getNonbondedUtilities
().
setTileRange
(
startIndex
,
contextTiles
[
i
]);
startIndex
+=
contextTiles
[
i
];
}
double
fractionToTransfer
=
min
(
0.001
,
contextNonbondedFractions
[
lastIndex
]);
contextNonbondedFractions
[
firstIndex
]
+=
fractionToTransfer
;
contextNonbondedFractions
[
lastIndex
]
-=
fractionToTransfer
;
double
startFraction
=
0.0
;
for
(
int
i
=
0
;
i
<
(
int
)
contextNonbondedFractions
.
size
();
i
++
)
{
double
endFraction
=
startFraction
+
contextNonbondedFractions
[
i
];
if
(
i
==
contextNonbondedFractions
.
size
()
-
1
)
endFraction
=
1.0
;
// Avoid roundoff error
data
.
contexts
[
i
]
->
getNonbondedUtilities
().
setAtomBlockRange
(
startFraction
,
endFraction
);
startFraction
=
endFraction
;
}
}
return
energy
;
...
...
platforms/opencl/src/OpenCLParallelKernels.h
View file @
93c467b2
...
...
@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011 Stanford University and the Authors.
*
* Portions copyright (c) 2011
-2013
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -80,7 +80,7 @@ private:
OpenCLPlatform
::
PlatformData
&
data
;
std
::
vector
<
Kernel
>
kernels
;
std
::
vector
<
long
long
>
completionTimes
;
std
::
vector
<
int
>
context
Tile
s
;
std
::
vector
<
double
>
context
NonbondedFraction
s
;
OpenCLArray
*
contextForces
;
cl
::
Buffer
*
pinnedPositionBuffer
;
cl
::
Buffer
*
pinnedForceBuffer
;
...
...
Prev
1
2
3
4
5
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment