Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
e19cefde
Commit
e19cefde
authored
Oct 22, 2014
by
peastman
Browse files
Merge pull request #665 from peastman/980
Workaround for driver bugs on GTX 980
parents
ba66e90e
be863b08
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
34 additions
and
26 deletions
+34
-26
platforms/cuda/include/CudaKernels.h
platforms/cuda/include/CudaKernels.h
+1
-1
platforms/cuda/src/CudaKernels.cpp
platforms/cuda/src/CudaKernels.cpp
+22
-14
platforms/opencl/src/kernels/sort.cl
platforms/opencl/src/kernels/sort.cl
+11
-11
No files found.
platforms/cuda/include/CudaKernels.h
View file @
e19cefde
...
@@ -632,7 +632,7 @@ private:
...
@@ -632,7 +632,7 @@ private:
std
::
vector
<
std
::
pair
<
int
,
int
>
>
exceptionAtoms
;
std
::
vector
<
std
::
pair
<
int
,
int
>
>
exceptionAtoms
;
double
ewaldSelfEnergy
,
dispersionCoefficient
,
alpha
;
double
ewaldSelfEnergy
,
dispersionCoefficient
,
alpha
;
int
interpolateForceThreads
;
int
interpolateForceThreads
;
bool
hasCoulomb
,
hasLJ
;
bool
hasCoulomb
,
hasLJ
,
usePmeStream
;
static
const
int
PmeOrder
=
5
;
static
const
int
PmeOrder
=
5
;
};
};
...
...
platforms/cuda/src/CudaKernels.cpp
View file @
e19cefde
...
@@ -1457,8 +1457,10 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
...
@@ -1457,8 +1457,10 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
if (hasInitializedFFT) {
if (hasInitializedFFT) {
cufftDestroy(fftForward);
cufftDestroy(fftForward);
cufftDestroy(fftBackward);
cufftDestroy(fftBackward);
cuStreamDestroy
(
pmeStream
);
if (usePmeStream) {
cuEventDestroy
(
pmeSyncEvent
);
cuStreamDestroy(pmeStream);
cuEventDestroy(pmeSyncEvent);
}
}
}
}
}
...
@@ -1670,15 +1672,18 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
...
@@ -1670,15 +1672,18 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
// Prepare for doing PME on its own stream.
// Prepare for doing PME on its own stream.
cuStreamCreate
(
&
pmeStream
,
CU_STREAM_NON_BLOCKING
);
usePmeStream = (cu.getComputeCapability() < 5.0); // A driver bug causes this to be very slow on GTX 980.
cufftSetStream
(
fftForward
,
pmeStream
);
if (usePmeStream) {
cufftSetStream
(
fftBackward
,
pmeStream
);
cuStreamCreate(&pmeStream, CU_STREAM_NON_BLOCKING);
CHECK_RESULT
(
cuEventCreate
(
&
pmeSyncEvent
,
CU_EVENT_DISABLE_TIMING
),
"Error creating event for NonbondedForce"
);
cufftSetStream(fftForward, pmeStream);
int
recipForceGroup
=
force
.
getReciprocalSpaceForceGroup
();
cufftSetStream(fftBackward, pmeStream);
if
(
recipForceGroup
<
0
)
CHECK_RESULT(cuEventCreate(&pmeSyncEvent, CU_EVENT_DISABLE_TIMING), "Error creating event for NonbondedForce");
recipForceGroup
=
force
.
getForceGroup
();
int recipForceGroup = force.getReciprocalSpaceForceGroup();
cu
.
addPreComputation
(
new
SyncStreamPreComputation
(
pmeStream
,
pmeSyncEvent
,
recipForceGroup
));
if (recipForceGroup < 0)
cu
.
addPostComputation
(
new
SyncStreamPostComputation
(
pmeSyncEvent
,
recipForceGroup
));
recipForceGroup = force.getForceGroup();
cu.addPreComputation(new SyncStreamPreComputation(pmeStream, pmeSyncEvent, recipForceGroup));
cu.addPostComputation(new SyncStreamPostComputation(pmeSyncEvent, recipForceGroup));
}
hasInitializedFFT = true;
hasInitializedFFT = true;
// Initialize the b-spline moduli.
// Initialize the b-spline moduli.
...
@@ -1795,7 +1800,8 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
...
@@ -1795,7 +1800,8 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
cu.executeKernel(ewaldForcesKernel, forcesArgs, cu.getNumAtoms());
cu.executeKernel(ewaldForcesKernel, forcesArgs, cu.getNumAtoms());
}
}
if (directPmeGrid != NULL && includeReciprocal) {
if (directPmeGrid != NULL && includeReciprocal) {
cu
.
setCurrentStream
(
pmeStream
);
if (usePmeStream)
cu.setCurrentStream(pmeStream);
void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
cu.executeKernel(pmeGridIndexKernel, gridIndexArgs, cu.getNumAtoms());
cu.executeKernel(pmeGridIndexKernel, gridIndexArgs, cu.getNumAtoms());
...
@@ -1832,8 +1838,10 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
...
@@ -1832,8 +1838,10 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &directPmeGrid->getDevicePointer(),
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &directPmeGrid->getDevicePointer(),
cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer(), &pmeAtomGridIndex->getDevicePointer()};
cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer(), &pmeAtomGridIndex->getDevicePointer()};
cu.executeKernel(pmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms(), 128);
cu.executeKernel(pmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms(), 128);
cuEventRecord
(
pmeSyncEvent
,
pmeStream
);
if (usePmeStream) {
cu
.
restoreDefaultStream
();
cuEventRecord(pmeSyncEvent, pmeStream);
cu.restoreDefaultStream();
}
}
}
double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
if (dispersionCoefficient != 0.0 && includeDirect) {
if (dispersionCoefficient != 0.0 && includeDirect) {
...
...
platforms/opencl/src/kernels/sort.cl
View file @
e19cefde
...
@@ -162,10 +162,10 @@ __kernel void copyDataToBuckets(__global const DATA_TYPE* restrict data, __globa
...
@@ -162,10 +162,10 @@ __kernel void copyDataToBuckets(__global const DATA_TYPE* restrict data, __globa
*
Sort
the
data
in
each
bucket.
*
Sort
the
data
in
each
bucket.
*/
*/
__kernel
void
sortBuckets
(
__global
DATA_TYPE*
restrict
data,
__global
const
DATA_TYPE*
restrict
buckets,
uint
numBuckets,
__global
const
uint*
restrict
bucketOffset,
__local
DATA_TYPE*
restrict
buffer
)
{
__kernel
void
sortBuckets
(
__global
DATA_TYPE*
restrict
data,
__global
const
DATA_TYPE*
restrict
buckets,
uint
numBuckets,
__global
const
uint*
restrict
bucketOffset,
__local
DATA_TYPE*
restrict
buffer
)
{
for
(
u
int
index
=
get_group_id
(
0
)
; index < numBuckets; index += get_num_groups(0)) {
for
(
int
index
=
get_group_id
(
0
)
; index < numBuckets; index += get_num_groups(0)) {
u
int
startIndex
=
(
index
==
0
?
0
:
bucketOffset[index-1]
)
;
int
startIndex
=
(
index
==
0
?
0
:
bucketOffset[index-1]
)
;
u
int
endIndex
=
bucketOffset[index]
;
int
endIndex
=
bucketOffset[index]
;
u
int
length
=
endIndex-startIndex
;
int
length
=
endIndex-startIndex
;
if
(
length
<=
get_local_size
(
0
))
{
if
(
length
<=
get_local_size
(
0
))
{
//
Load
the
data
into
local
memory.
//
Load
the
data
into
local
memory.
...
@@ -177,8 +177,8 @@ __kernel void sortBuckets(__global DATA_TYPE* restrict data, __global const DATA
...
@@ -177,8 +177,8 @@ __kernel void sortBuckets(__global DATA_TYPE* restrict data, __global const DATA
//
Perform
a
bitonic
sort
in
local
memory.
//
Perform
a
bitonic
sort
in
local
memory.
for
(
u
int
k
=
2
; k <= get_local_size(0); k *= 2) {
for
(
int
k
=
2
; k <= get_local_size(0); k *= 2) {
for
(
u
int
j
=
k/2
; j > 0; j /= 2) {
for
(
int
j
=
k/2
; j > 0; j /= 2) {
int
ixj
=
get_local_id
(
0
)
^j
;
int
ixj
=
get_local_id
(
0
)
^j
;
if
(
ixj
>
get_local_id
(
0
))
{
if
(
ixj
>
get_local_id
(
0
))
{
DATA_TYPE
value1
=
buffer[get_local_id
(
0
)
]
;
DATA_TYPE
value1
=
buffer[get_local_id
(
0
)
]
;
...
@@ -203,21 +203,21 @@ __kernel void sortBuckets(__global DATA_TYPE* restrict data, __global const DATA
...
@@ -203,21 +203,21 @@ __kernel void sortBuckets(__global DATA_TYPE* restrict data, __global const DATA
else
{
else
{
//
Copy
the
bucket
data
over
to
the
output
array.
//
Copy
the
bucket
data
over
to
the
output
array.
for
(
u
int
i
=
get_local_id
(
0
)
; i < length; i += get_local_size(0))
for
(
int
i
=
get_local_id
(
0
)
; i < length; i += get_local_size(0))
data[startIndex+i]
=
buckets[startIndex+i]
;
data[startIndex+i]
=
buckets[startIndex+i]
;
barrier
(
CLK_GLOBAL_MEM_FENCE
)
;
barrier
(
CLK_GLOBAL_MEM_FENCE
)
;
//
Perform
a
bitonic
sort
in
global
memory.
//
Perform
a
bitonic
sort
in
global
memory.
for
(
u
int
k
=
2
; k < 2*length; k *= 2) {
for
(
int
k
=
2
; k < 2*length; k *= 2) {
for
(
u
int
j
=
k/2
; j > 0; j /= 2) {
for
(
int
j
=
k/2
; j > 0; j /= 2) {
for
(
u
int
i
=
get_local_id
(
0
)
; i < length; i += get_local_size(0)) {
for
(
int
i
=
get_local_id
(
0
)
; i < length; i += get_local_size(0)) {
int
ixj
=
i^j
;
int
ixj
=
i^j
;
if
(
ixj
>
i
&&
ixj
<
length
)
{
if
(
ixj
>
i
&&
ixj
<
length
)
{
DATA_TYPE
value1
=
data[startIndex+i]
;
DATA_TYPE
value1
=
data[startIndex+i]
;
DATA_TYPE
value2
=
data[startIndex+ixj]
;
DATA_TYPE
value2
=
data[startIndex+ixj]
;
bool
ascending
=
((
i&k
)
==
0
)
;
bool
ascending
=
((
i&k
)
==
0
)
;
for
(
u
int
mask
=
k*2
; mask < 2*length; mask *= 2)
for
(
int
mask
=
k*2
; mask < 2*length; mask *= 2)
ascending
=
((
i&mask
)
==
0
?
!ascending
:
ascending
)
;
ascending
=
((
i&mask
)
==
0
?
!ascending
:
ascending
)
;
KEY_TYPE
lowKey
=
(
ascending
?
getValue
(
value1
)
:
getValue
(
value2
))
;
KEY_TYPE
lowKey
=
(
ascending
?
getValue
(
value1
)
:
getValue
(
value2
))
;
KEY_TYPE
highKey
=
(
ascending
?
getValue
(
value2
)
:
getValue
(
value1
))
;
KEY_TYPE
highKey
=
(
ascending
?
getValue
(
value2
)
:
getValue
(
value1
))
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment