Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
8d206abc
Commit
8d206abc
authored
Aug 26, 2009
by
Peter Eastman
Browse files
Replaced CUDPP with Imran's code for stream compaction.
parent
cc5c1492
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
278 additions
and
68 deletions
+278
-68
platforms/cuda/src/kernels/cudaCompact.cu
platforms/cuda/src/kernels/cudaCompact.cu
+221
-0
platforms/cuda/src/kernels/cudaCompact.h
platforms/cuda/src/kernels/cudaCompact.h
+44
-0
platforms/cuda/src/kernels/gpu.cpp
platforms/cuda/src/kernels/gpu.cpp
+4
-14
platforms/cuda/src/kernels/gputypes.h
platforms/cuda/src/kernels/gputypes.h
+2
-2
platforms/cuda/src/kernels/kCalculateCDLJForces.cu
platforms/cuda/src/kernels/kCalculateCDLJForces.cu
+3
-22
platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1.cu
platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1.cu
+2
-15
platforms/cuda/src/kernels/kCalculateCustomNonbondedForces.cu
...forms/cuda/src/kernels/kCalculateCustomNonbondedForces.cu
+2
-15
No files found.
platforms/cuda/src/kernels/cudaCompact.cu
0 → 100644
View file @
8d206abc
/* Code for CUDA stream compaction. Roughly based on:
Billeter M, Olsson O, Assarsson U. Efficient Stream Compaction on Wide SIMD Many-Core Architectures.
High Performance Graphics 2009.
Notes:
- paper recommends 128 threads/block, so this is hard coded.
- I only implement the prefix-sum based compact primitive, and not the POPC one, as that is more
complicated and performs poorly on current hardware
- I only implement the scattered- and staged-write variant of phase III as it they have reasonable
performance across most of the tested workloads in the paper. The selective variant is not
implemented.
- The prefix sum of per-block element counts (phase II) is not done in a particularly efficient
manner. It is, however, done in a very easy to program manner, and integrated into the top of
phase III, reducing the number of kernel invocations required. If one wanted to use existing code,
it'd be easy to take the CUDA SDK scanLargeArray sample, and do a prefix sum over dgBlockCounts in
a phase II kernel. You could also adapt the existing prescan128 to take an initial value, and scan
dgBlockCounts in stages.
Date: 23 Aug 2009
Author: Imran Haque (ihaque@cs.stanford.edu)
Affiliation: Stanford University
License: Public Domain
*/
#include "cudaCompact.h"
typedef
unsigned
int
T
;
// Phase 1: Count valid elements per thread block
// Hard-code 128 thd/blk
__device__
unsigned
int
sumReduce128
(
unsigned
int
*
arr
)
{
// Parallel reduce element counts
// Assumes 128 thd/block
if
(
threadIdx
.
x
<
64
)
arr
[
threadIdx
.
x
]
+=
arr
[
threadIdx
.
x
+
64
];
__syncthreads
();
if
(
threadIdx
.
x
<
32
)
{
arr
[
threadIdx
.
x
]
+=
arr
[
threadIdx
.
x
+
32
];
if
(
threadIdx
.
x
<
16
)
arr
[
threadIdx
.
x
]
+=
arr
[
threadIdx
.
x
+
16
];
if
(
threadIdx
.
x
<
8
)
arr
[
threadIdx
.
x
]
+=
arr
[
threadIdx
.
x
+
8
];
if
(
threadIdx
.
x
<
4
)
arr
[
threadIdx
.
x
]
+=
arr
[
threadIdx
.
x
+
4
];
if
(
threadIdx
.
x
<
2
)
arr
[
threadIdx
.
x
]
+=
arr
[
threadIdx
.
x
+
2
];
if
(
threadIdx
.
x
<
1
)
arr
[
threadIdx
.
x
]
+=
arr
[
threadIdx
.
x
+
1
];
}
__syncthreads
();
return
arr
[
0
];
}
__global__
void
countElts
(
unsigned
int
*
dgBlockCounts
,
const
unsigned
int
*
dgValid
,
const
size_t
eltsPerBlock
,
const
size_t
len
)
{
__shared__
unsigned
int
dsCount
[
128
];
dsCount
[
threadIdx
.
x
]
=
0
;
size_t
ub
;
ub
=
(
len
<
(
blockIdx
.
x
+
1
)
*
eltsPerBlock
)
?
len
:
((
blockIdx
.
x
+
1
)
*
eltsPerBlock
);
for
(
int
base
=
blockIdx
.
x
*
eltsPerBlock
;
base
<
(
blockIdx
.
x
+
1
)
*
eltsPerBlock
;
base
+=
blockDim
.
x
)
{
if
((
base
+
threadIdx
.
x
)
<
ub
&&
dgValid
[
base
+
threadIdx
.
x
])
dsCount
[
threadIdx
.
x
]
++
;
}
__syncthreads
();
unsigned
int
blockCount
=
sumReduce128
(
dsCount
);
if
(
threadIdx
.
x
==
0
)
dgBlockCounts
[
blockIdx
.
x
]
=
blockCount
;
return
;
}
// Phase 2/3: Move valid elements using SIMD compaction (phase 2 is done implicitly at top of __global__ method)
// Exclusive prefix scan over 128 elements
// Assumes 128 threads
// Taken from cuda SDK "scan" sample for naive scan, with small modifications
__device__
int
exclusivePrescan128
(
const
unsigned
int
*
in
,
unsigned
int
*
outAndTemp
)
{
const
int
n
=
128
;
//TODO: this temp storage could be reduced since we write to shared memory in out anyway, and n is hardcoded
//__shared__ int temp[2*n];
unsigned
int
*
temp
=
outAndTemp
;
int
pout
=
1
,
pin
=
0
;
// load input into temp
// This is exclusive scan, so shift right by one and set first elt to 0
temp
[
pout
*
n
+
threadIdx
.
x
]
=
(
threadIdx
.
x
>
0
)
?
in
[
threadIdx
.
x
-
1
]
:
0
;
__syncthreads
();
for
(
int
offset
=
1
;
offset
<
n
;
offset
*=
2
)
{
pout
=
1
-
pout
;
// swap double buffer indices
pin
=
1
-
pout
;
__syncthreads
();
temp
[
pout
*
n
+
threadIdx
.
x
]
=
temp
[
pin
*
n
+
threadIdx
.
x
];
if
(
threadIdx
.
x
>=
offset
)
temp
[
pout
*
n
+
threadIdx
.
x
]
+=
temp
[
pin
*
n
+
threadIdx
.
x
-
offset
];
}
//out[threadIdx.x] = temp[pout*n+threadIdx.x]; // write output
__syncthreads
();
return
outAndTemp
[
127
]
+
in
[
127
];
// Return sum of all elements
}
__device__
int
compactSIMDPrefixSum
(
const
T
*
dsData
,
const
unsigned
int
*
dsValid
,
T
*
dsCompact
)
{
__shared__
unsigned
int
dsLocalIndex
[
256
];
int
numValid
=
exclusivePrescan128
(
dsValid
,
dsLocalIndex
);
if
(
dsValid
[
threadIdx
.
x
])
dsCompact
[
dsLocalIndex
[
threadIdx
.
x
]]
=
dsData
[
threadIdx
.
x
];
return
numValid
;
}
__global__
void
moveValidElementsStaged
(
const
T
*
dgData
,
T
*
dgCompact
,
const
unsigned
int
*
dgValid
,
const
unsigned
int
*
dgBlockCounts
,
size_t
eltsPerBlock
,
size_t
len
,
size_t
*
dNumValidElements
)
{
__shared__
T
inBlock
[
128
];
__shared__
unsigned
int
validBlock
[
128
];
__shared__
T
compactBlock
[
128
];
int
blockOutOffset
=
0
;
// Sum up the blockCounts before us to find our offset
// This is totally inefficient - lots of repeated work b/w blocks, and uneven balancing.
// Paper implements this as a prefix sum kernel in phase II
// May still be faster than an extra kernel invocation?
for
(
int
base
=
0
;
base
<
blockIdx
.
x
;
base
+=
blockDim
.
x
)
{
// Load up the count of valid elements for each block before us in batches of 128
if
((
base
+
threadIdx
.
x
)
<
blockIdx
.
x
)
{
validBlock
[
threadIdx
.
x
]
=
dgBlockCounts
[
base
+
threadIdx
.
x
];
}
else
{
validBlock
[
threadIdx
.
x
]
=
0
;
}
__syncthreads
();
// Parallel reduce these counts
// Accumulate in the final offset variable
blockOutOffset
+=
sumReduce128
(
validBlock
);
}
size_t
ub
;
ub
=
(
len
<
(
blockIdx
.
x
+
1
)
*
eltsPerBlock
)
?
len
:
((
blockIdx
.
x
+
1
)
*
eltsPerBlock
);
for
(
int
base
=
blockIdx
.
x
*
eltsPerBlock
;
base
<
(
blockIdx
.
x
+
1
)
*
eltsPerBlock
;
base
+=
blockDim
.
x
)
{
if
((
base
+
threadIdx
.
x
)
<
ub
)
{
validBlock
[
threadIdx
.
x
]
=
dgValid
[
base
+
threadIdx
.
x
];
inBlock
[
threadIdx
.
x
]
=
dgData
[
base
+
threadIdx
.
x
];
}
else
{
validBlock
[
threadIdx
.
x
]
=
0
;
}
__syncthreads
();
int
numValidBlock
=
compactSIMDPrefixSum
(
inBlock
,
validBlock
,
compactBlock
);
__syncthreads
();
if
(
threadIdx
.
x
<
numValidBlock
)
{
dgCompact
[
blockOutOffset
+
threadIdx
.
x
]
=
compactBlock
[
threadIdx
.
x
];
}
blockOutOffset
+=
numValidBlock
;
}
if
(
blockIdx
.
x
==
(
gridDim
.
x
-
1
)
&&
threadIdx
.
x
==
0
)
{
*
dNumValidElements
=
blockOutOffset
;
}
}
__global__
void
moveValidElementsScattered
(
const
T
*
dgData
,
T
*
dgCompact
,
const
unsigned
int
*
dgValid
,
const
unsigned
int
*
dgBlockCounts
,
size_t
eltsPerBlock
,
size_t
len
,
size_t
*
dNumValidElements
)
{
__shared__
T
inBlock
[
128
];
__shared__
unsigned
int
validBlock
[
128
];
T
*
compactBlock
=
dgCompact
;
size_t
blockOutOffset
=
0
;
// Sum up the blockCounts before us to find our offset
// This is totally inefficient - lots of repeated work b/w blocks, and uneven balancing.
// Paper implements this as a prefix sum kernel in phase II
// May still be faster than an extra kernel invocation?
for
(
int
base
=
0
;
base
<
blockIdx
.
x
;
base
+=
blockDim
.
x
)
{
// Load up the count of valid elements for each block before us in batches of 128
if
((
base
+
threadIdx
.
x
)
<
blockIdx
.
x
)
{
validBlock
[
threadIdx
.
x
]
=
dgBlockCounts
[
base
+
threadIdx
.
x
];
}
else
{
validBlock
[
threadIdx
.
x
]
=
0
;
}
__syncthreads
();
// Parallel reduce these counts
// Accumulate in the final offset variable
blockOutOffset
+=
sumReduce128
(
validBlock
);
}
compactBlock
+=
blockOutOffset
;
size_t
ub
;
ub
=
(
len
<
(
blockIdx
.
x
+
1
)
*
eltsPerBlock
)
?
len
:
((
blockIdx
.
x
+
1
)
*
eltsPerBlock
);
for
(
int
base
=
blockIdx
.
x
*
eltsPerBlock
;
base
<
(
blockIdx
.
x
+
1
)
*
eltsPerBlock
;
base
+=
blockDim
.
x
)
{
if
((
base
+
threadIdx
.
x
)
<
ub
)
{
validBlock
[
threadIdx
.
x
]
=
dgValid
[
base
+
threadIdx
.
x
];
inBlock
[
threadIdx
.
x
]
=
dgData
[
base
+
threadIdx
.
x
];
}
else
{
validBlock
[
threadIdx
.
x
]
=
0
;
}
__syncthreads
();
int
numValidBlock
=
compactSIMDPrefixSum
(
inBlock
,
validBlock
,
compactBlock
);
blockOutOffset
+=
numValidBlock
;
compactBlock
+=
numValidBlock
;
}
if
(
blockIdx
.
x
==
(
gridDim
.
x
-
1
)
&&
threadIdx
.
x
==
0
)
{
*
dNumValidElements
=
blockOutOffset
;
}
}
void
planCompaction
(
compactionPlan
&
d
,
bool
stageOutput
)
{
cudaDeviceProp
deviceProp
;
cudaGetDeviceProperties
(
&
deviceProp
,
0
);
d
.
nThreadBlocks
=
16
*
deviceProp
.
multiProcessorCount
;
cudaMalloc
((
void
**
)
&
(
d
.
dgBlockCounts
),
d
.
nThreadBlocks
*
sizeof
(
unsigned
int
));
d
.
stageOutput
=
stageOutput
;
// TODO: make sure allocation worked
d
.
valid
=
true
;
}
void
destroyCompactionPlan
(
compactionPlan
&
d
)
{
if
(
d
.
valid
)
cudaFree
(
d
.
dgBlockCounts
);
}
int
compactStream
(
const
compactionPlan
&
d
,
T
*
dOut
,
const
T
*
dIn
,
const
unsigned
int
*
dValid
,
size_t
len
,
size_t
*
dNumValid
)
{
if
(
!
d
.
valid
)
{
return
-
1
;
}
// Figure out # elements per block
unsigned
int
numBlocks
=
d
.
nThreadBlocks
;
if
(
numBlocks
*
128
>
len
)
numBlocks
=
(
len
+
127
)
/
128
;
const
size_t
eltsPerBlock
=
len
/
numBlocks
+
((
len
%
numBlocks
)
?
1
:
0
);
// TODO: implement loop over blocks of 10M
// Phase 1: Calculate number of valid elements per thread block
countElts
<<<
numBlocks
,
128
>>>
(
d
.
dgBlockCounts
,
dValid
,
eltsPerBlock
,
len
);
// Phase 2/3: Move valid elements using SIMD compaction
if
(
d
.
stageOutput
)
{
moveValidElementsStaged
<<<
numBlocks
,
128
>>>
(
dIn
,
dOut
,
dValid
,
d
.
dgBlockCounts
,
eltsPerBlock
,
len
,
dNumValid
);
}
else
{
moveValidElementsScattered
<<<
numBlocks
,
128
>>>
(
dIn
,
dOut
,
dValid
,
d
.
dgBlockCounts
,
eltsPerBlock
,
len
,
dNumValid
);
}
return
0
;
}
platforms/cuda/src/kernels/cudaCompact.h
0 → 100644
View file @
8d206abc
#ifndef __OPENMM_CUDACOMPACT_H__
#define __OPENMM_CUDACOMPACT_H__
/* Code for CUDA stream compaction. Roughly based on:
Billeter M, Olsson O, Assarsson U. Efficient Stream Compaction on Wide SIMD Many-Core Architectures.
High Performance Graphics 2009.
Notes:
- paper recommends 128 threads/block, so this is hard coded.
- I only implement the prefix-sum based compact primitive, and not the POPC one, as that is more
complicated and performs poorly on current hardware
- I only implement the scattered- and staged-write variant of phase III as it they have reasonable
performance across most of the tested workloads in the paper. The selective variant is not
implemented.
- The prefix sum of per-block element counts (phase II) is not done in a particularly efficient
manner. It is, however, done in a very easy to program manner, and integrated into the top of
phase III, reducing the number of kernel invocations required. If one wanted to use existing code,
it'd be easy to take the CUDA SDK scanLargeArray sample, and do a prefix sum over dgBlockCounts in
a phase II kernel. You could also adapt the existing prescan128 to take an initial value, and scan
dgBlockCounts in stages.
Date: 23 Aug 2009
Author: Imran Haque (ihaque@cs.stanford.edu)
Affiliation: Stanford University
License: Public Domain
*/
struct
compactionPlan
{
bool
valid
;
unsigned
int
*
dgBlockCounts
;
unsigned
int
nThreadBlocks
;
bool
stageOutput
;
};
extern
"C"
void
planCompaction
(
compactionPlan
&
d
,
bool
stageOutput
=
true
);
extern
"C"
void
destroyCompactionPlan
(
compactionPlan
&
d
);
extern
"C"
int
compactStream
(
const
compactionPlan
&
d
,
unsigned
int
*
dOut
,
const
unsigned
int
*
dIn
,
const
unsigned
int
*
dValid
,
size_t
len
,
size_t
*
dNumValid
);
#endif // __OPENMM_CUDACOMPACT_H__
\ No newline at end of file
platforms/cuda/src/kernels/gpu.cpp
View file @
8d206abc
...
@@ -1801,8 +1801,8 @@ void gpuShutDown(gpuContext gpu)
...
@@ -1801,8 +1801,8 @@ void gpuShutDown(gpuContext gpu)
for
(
int
i
=
0
;
i
<
MAX_TABULATED_FUNCTIONS
;
i
++
)
for
(
int
i
=
0
;
i
<
MAX_TABULATED_FUNCTIONS
;
i
++
)
if
(
gpu
->
tabulatedFunctions
[
i
].
coefficients
!=
NULL
)
if
(
gpu
->
tabulatedFunctions
[
i
].
coefficients
!=
NULL
)
delete
gpu
->
tabulatedFunctions
[
i
].
coefficients
;
delete
gpu
->
tabulatedFunctions
[
i
].
coefficients
;
if
(
gpu
->
c
udpp
!=
0
)
if
(
gpu
->
c
ompactPlan
.
valid
)
cudppDestroy
Plan
(
gpu
->
c
udpp
);
destroyCompaction
Plan
(
gpu
->
c
ompactPlan
);
// Wrap up
// Wrap up
delete
gpu
;
delete
gpu
;
...
@@ -1930,18 +1930,8 @@ int gpuBuildThreadBlockWorkList(gpuContext gpu)
...
@@ -1930,18 +1930,8 @@ int gpuBuildThreadBlockWorkList(gpuContext gpu)
gpu
->
sim
.
bornForce2_workBlock
=
gpu
->
sim
.
bornForce2_threads_per_block
/
GRID
;
gpu
->
sim
.
bornForce2_workBlock
=
gpu
->
sim
.
bornForce2_threads_per_block
/
GRID
;
gpu
->
sim
.
workUnits
=
cells
;
gpu
->
sim
.
workUnits
=
cells
;
// Initialize the CUDPP workspace.
// Initialize the plan for doing stream compaction.
gpu
->
cudpp
=
0
;
planCompaction
(
gpu
->
compactPlan
);
CUDPPConfiguration
config
;
config
.
datatype
=
CUDPP_UINT
;
config
.
algorithm
=
CUDPP_COMPACT
;
config
.
options
=
CUDPP_OPTION_FORWARD
;
CUDPPResult
result
=
cudppPlan
(
&
gpu
->
cudpp
,
config
,
cells
,
1
,
0
);
if
(
CUDPP_SUCCESS
!=
result
)
{
printf
(
"Error initializing CUDPP: %d
\n
"
,
result
);
exit
(
-
1
);
}
// Increase block count if necessary for extra large molecules that would
// Increase block count if necessary for extra large molecules that would
// otherwise overflow the SM workunit buffers
// otherwise overflow the SM workunit buffers
...
...
platforms/cuda/src/kernels/gputypes.h
View file @
8d206abc
...
@@ -28,7 +28,7 @@
...
@@ -28,7 +28,7 @@
* -------------------------------------------------------------------------- */
* -------------------------------------------------------------------------- */
#include "cudatypes.h"
#include "cudatypes.h"
#include "cud
pp
.h"
#include "cud
aCompact
.h"
#include <vector>
#include <vector>
struct
gpuAtomType
{
struct
gpuAtomType
{
...
@@ -87,7 +87,7 @@ struct _gpuContext {
...
@@ -87,7 +87,7 @@ struct _gpuContext {
bool
bIncludeGBSA
;
bool
bIncludeGBSA
;
unsigned
long
seed
;
unsigned
long
seed
;
SM_VERSION
sm_version
;
SM_VERSION
sm_version
;
CUDPPHandle
cudpp
;
compactionPlan
compactPlan
;
CUDAStream
<
float4
>*
psPosq4
;
CUDAStream
<
float4
>*
psPosq4
;
CUDAStream
<
float4
>*
psPosqP4
;
CUDAStream
<
float4
>*
psPosqP4
;
CUDAStream
<
float4
>*
psOldPosq4
;
CUDAStream
<
float4
>*
psOldPosq4
;
...
...
platforms/cuda/src/kernels/kCalculateCDLJForces.cu
View file @
8d206abc
...
@@ -124,7 +124,6 @@ void GetCalculateCDLJForcesSim(gpuContext gpu)
...
@@ -124,7 +124,6 @@ void GetCalculateCDLJForcesSim(gpuContext gpu)
void
kCalculateCDLJForces
(
gpuContext
gpu
)
void
kCalculateCDLJForces
(
gpuContext
gpu
)
{
{
// printf("kCalculateCDLJCutoffForces\n");
// printf("kCalculateCDLJCutoffForces\n");
CUDPPResult
result
;
switch
(
gpu
->
sim
.
nonbondedMethod
)
switch
(
gpu
->
sim
.
nonbondedMethod
)
{
{
case
NO_CUTOFF
:
case
NO_CUTOFF
:
...
@@ -141,13 +140,7 @@ void kCalculateCDLJForces(gpuContext gpu)
...
@@ -141,13 +140,7 @@ void kCalculateCDLJForces(gpuContext gpu)
LAUNCHERROR
(
"kFindBlockBoundsCutoff"
);
LAUNCHERROR
(
"kFindBlockBoundsCutoff"
);
kFindBlocksWithInteractionsCutoff_kernel
<<<
gpu
->
sim
.
interaction_blocks
,
gpu
->
sim
.
interaction_threads_per_block
>>>
();
kFindBlocksWithInteractionsCutoff_kernel
<<<
gpu
->
sim
.
interaction_blocks
,
gpu
->
sim
.
interaction_threads_per_block
>>>
();
LAUNCHERROR
(
"kFindBlocksWithInteractionsCutoff"
);
LAUNCHERROR
(
"kFindBlocksWithInteractionsCutoff"
);
result
=
cudppCompact
(
gpu
->
cudpp
,
gpu
->
sim
.
pInteractingWorkUnit
,
gpu
->
sim
.
pInteractionCount
,
compactStream
(
gpu
->
compactPlan
,
gpu
->
sim
.
pInteractingWorkUnit
,
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
pInteractionFlag
,
gpu
->
sim
.
workUnits
,
gpu
->
sim
.
pInteractionCount
);
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
pInteractionFlag
,
gpu
->
sim
.
workUnits
);
if
(
result
!=
CUDPP_SUCCESS
)
{
printf
(
"Error in cudppCompact: %d
\n
"
,
result
);
exit
(
-
1
);
}
kFindInteractionsWithinBlocksCutoff_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
kFindInteractionsWithinBlocksCutoff_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
unsigned
int
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
sizeof
(
unsigned
int
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
if
(
gpu
->
bOutputBufferPerWarp
)
if
(
gpu
->
bOutputBufferPerWarp
)
...
@@ -163,13 +156,7 @@ void kCalculateCDLJForces(gpuContext gpu)
...
@@ -163,13 +156,7 @@ void kCalculateCDLJForces(gpuContext gpu)
LAUNCHERROR
(
"kFindBlockBoundsPeriodic"
);
LAUNCHERROR
(
"kFindBlockBoundsPeriodic"
);
kFindBlocksWithInteractionsPeriodic_kernel
<<<
gpu
->
sim
.
interaction_blocks
,
gpu
->
sim
.
interaction_threads_per_block
>>>
();
kFindBlocksWithInteractionsPeriodic_kernel
<<<
gpu
->
sim
.
interaction_blocks
,
gpu
->
sim
.
interaction_threads_per_block
>>>
();
LAUNCHERROR
(
"kFindBlocksWithInteractionsPeriodic"
);
LAUNCHERROR
(
"kFindBlocksWithInteractionsPeriodic"
);
result
=
cudppCompact
(
gpu
->
cudpp
,
gpu
->
sim
.
pInteractingWorkUnit
,
gpu
->
sim
.
pInteractionCount
,
compactStream
(
gpu
->
compactPlan
,
gpu
->
sim
.
pInteractingWorkUnit
,
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
pInteractionFlag
,
gpu
->
sim
.
workUnits
,
gpu
->
sim
.
pInteractionCount
);
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
pInteractionFlag
,
gpu
->
sim
.
workUnits
);
if
(
result
!=
CUDPP_SUCCESS
)
{
printf
(
"Error in cudppCompact: %d
\n
"
,
result
);
exit
(
-
1
);
}
kFindInteractionsWithinBlocksPeriodic_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
kFindInteractionsWithinBlocksPeriodic_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
unsigned
int
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
sizeof
(
unsigned
int
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
if
(
gpu
->
bOutputBufferPerWarp
)
if
(
gpu
->
bOutputBufferPerWarp
)
...
@@ -185,13 +172,7 @@ void kCalculateCDLJForces(gpuContext gpu)
...
@@ -185,13 +172,7 @@ void kCalculateCDLJForces(gpuContext gpu)
LAUNCHERROR
(
"kFindBlockBoundsEwaldDirect"
);
LAUNCHERROR
(
"kFindBlockBoundsEwaldDirect"
);
kFindBlocksWithInteractionsEwaldDirect_kernel
<<<
gpu
->
sim
.
interaction_blocks
,
gpu
->
sim
.
interaction_threads_per_block
>>>
();
kFindBlocksWithInteractionsEwaldDirect_kernel
<<<
gpu
->
sim
.
interaction_blocks
,
gpu
->
sim
.
interaction_threads_per_block
>>>
();
LAUNCHERROR
(
"kFindBlocksWithInteractionsEwaldDirect"
);
LAUNCHERROR
(
"kFindBlocksWithInteractionsEwaldDirect"
);
result
=
cudppCompact
(
gpu
->
cudpp
,
gpu
->
sim
.
pInteractingWorkUnit
,
gpu
->
sim
.
pInteractionCount
,
compactStream
(
gpu
->
compactPlan
,
gpu
->
sim
.
pInteractingWorkUnit
,
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
pInteractionFlag
,
gpu
->
sim
.
workUnits
,
gpu
->
sim
.
pInteractionCount
);
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
pInteractionFlag
,
gpu
->
sim
.
workUnits
);
if
(
result
!=
CUDPP_SUCCESS
)
{
printf
(
"Error in cudppCompact: %d
\n
"
,
result
);
exit
(
-
1
);
}
kFindInteractionsWithinBlocksEwaldDirect_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
kFindInteractionsWithinBlocksEwaldDirect_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
unsigned
int
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
sizeof
(
unsigned
int
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
if
(
gpu
->
bOutputBufferPerWarp
)
if
(
gpu
->
bOutputBufferPerWarp
)
...
...
platforms/cuda/src/kernels/kCalculateCDLJObcGbsaForces1.cu
View file @
8d206abc
...
@@ -111,7 +111,6 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
...
@@ -111,7 +111,6 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
// check if Born radii need to be calculated
// check if Born radii need to be calculated
kClearBornForces
(
gpu
);
kClearBornForces
(
gpu
);
CUDPPResult
result
;
switch
(
gpu
->
sim
.
nonbondedMethod
)
switch
(
gpu
->
sim
.
nonbondedMethod
)
{
{
case
NO_CUTOFF
:
case
NO_CUTOFF
:
...
@@ -133,13 +132,7 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
...
@@ -133,13 +132,7 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
LAUNCHERROR
(
"kFindBlockBoundsCutoff"
);
LAUNCHERROR
(
"kFindBlockBoundsCutoff"
);
kFindBlocksWithInteractionsCutoff_kernel
<<<
gpu
->
sim
.
interaction_blocks
,
gpu
->
sim
.
interaction_threads_per_block
>>>
();
kFindBlocksWithInteractionsCutoff_kernel
<<<
gpu
->
sim
.
interaction_blocks
,
gpu
->
sim
.
interaction_threads_per_block
>>>
();
LAUNCHERROR
(
"kFindBlocksWithInteractionsCutoff"
);
LAUNCHERROR
(
"kFindBlocksWithInteractionsCutoff"
);
result
=
cudppCompact
(
gpu
->
cudpp
,
gpu
->
sim
.
pInteractingWorkUnit
,
gpu
->
sim
.
pInteractionCount
,
compactStream
(
gpu
->
compactPlan
,
gpu
->
sim
.
pInteractingWorkUnit
,
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
pInteractionFlag
,
gpu
->
sim
.
workUnits
,
gpu
->
sim
.
pInteractionCount
);
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
pInteractionFlag
,
gpu
->
sim
.
workUnits
);
if
(
result
!=
CUDPP_SUCCESS
)
{
printf
(
"Error in cudppCompact: %d
\n
"
,
result
);
exit
(
-
1
);
}
kFindInteractionsWithinBlocksCutoff_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
kFindInteractionsWithinBlocksCutoff_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
unsigned
int
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
sizeof
(
unsigned
int
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
if
(
gpu
->
bRecalculateBornRadii
)
if
(
gpu
->
bRecalculateBornRadii
)
...
@@ -160,13 +153,7 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
...
@@ -160,13 +153,7 @@ void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
LAUNCHERROR
(
"kFindBlockBoundsPeriodic"
);
LAUNCHERROR
(
"kFindBlockBoundsPeriodic"
);
kFindBlocksWithInteractionsPeriodic_kernel
<<<
gpu
->
sim
.
interaction_blocks
,
gpu
->
sim
.
interaction_threads_per_block
>>>
();
kFindBlocksWithInteractionsPeriodic_kernel
<<<
gpu
->
sim
.
interaction_blocks
,
gpu
->
sim
.
interaction_threads_per_block
>>>
();
LAUNCHERROR
(
"kFindBlocksWithInteractionsPeriodic"
);
LAUNCHERROR
(
"kFindBlocksWithInteractionsPeriodic"
);
result
=
cudppCompact
(
gpu
->
cudpp
,
gpu
->
sim
.
pInteractingWorkUnit
,
gpu
->
sim
.
pInteractionCount
,
compactStream
(
gpu
->
compactPlan
,
gpu
->
sim
.
pInteractingWorkUnit
,
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
pInteractionFlag
,
gpu
->
sim
.
workUnits
,
gpu
->
sim
.
pInteractionCount
);
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
pInteractionFlag
,
gpu
->
sim
.
workUnits
);
if
(
result
!=
CUDPP_SUCCESS
)
{
printf
(
"Error in cudppCompact: %d
\n
"
,
result
);
exit
(
-
1
);
}
kFindInteractionsWithinBlocksPeriodic_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
kFindInteractionsWithinBlocksPeriodic_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
unsigned
int
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
sizeof
(
unsigned
int
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
if
(
gpu
->
bRecalculateBornRadii
)
if
(
gpu
->
bRecalculateBornRadii
)
...
...
platforms/cuda/src/kernels/kCalculateCustomNonbondedForces.cu
View file @
8d206abc
...
@@ -439,7 +439,6 @@ __global__ void kFindInteractionsWithinBlocksPeriodic_kernel(unsigned int* workU
...
@@ -439,7 +439,6 @@ __global__ void kFindInteractionsWithinBlocksPeriodic_kernel(unsigned int* workU
void
kCalculateCustomNonbondedForces
(
gpuContext
gpu
,
bool
neighborListValid
)
void
kCalculateCustomNonbondedForces
(
gpuContext
gpu
,
bool
neighborListValid
)
{
{
// printf("kCalculateCustomNonbondedCutoffForces\n");
// printf("kCalculateCustomNonbondedCutoffForces\n");
CUDPPResult
result
;
int
sharedPerThread
=
sizeof
(
Atom
)
+
gpu
->
sim
.
customExpressionStackSize
*
sizeof
(
float
);
int
sharedPerThread
=
sizeof
(
Atom
)
+
gpu
->
sim
.
customExpressionStackSize
*
sizeof
(
float
);
if
(
gpu
->
sim
.
customNonbondedMethod
!=
NO_CUTOFF
)
if
(
gpu
->
sim
.
customNonbondedMethod
!=
NO_CUTOFF
)
sharedPerThread
+=
sizeof
(
float3
);
sharedPerThread
+=
sizeof
(
float3
);
...
@@ -466,13 +465,7 @@ void kCalculateCustomNonbondedForces(gpuContext gpu, bool neighborListValid)
...
@@ -466,13 +465,7 @@ void kCalculateCustomNonbondedForces(gpuContext gpu, bool neighborListValid)
LAUNCHERROR
(
"kFindBlockBoundsCutoff"
);
LAUNCHERROR
(
"kFindBlockBoundsCutoff"
);
kFindBlocksWithInteractionsCutoff_kernel
<<<
gpu
->
sim
.
interaction_blocks
,
gpu
->
sim
.
interaction_threads_per_block
>>>
();
kFindBlocksWithInteractionsCutoff_kernel
<<<
gpu
->
sim
.
interaction_blocks
,
gpu
->
sim
.
interaction_threads_per_block
>>>
();
LAUNCHERROR
(
"kFindBlocksWithInteractionsCutoff"
);
LAUNCHERROR
(
"kFindBlocksWithInteractionsCutoff"
);
result
=
cudppCompact
(
gpu
->
cudpp
,
gpu
->
sim
.
pInteractingWorkUnit
,
gpu
->
sim
.
pInteractionCount
,
compactStream
(
gpu
->
compactPlan
,
gpu
->
sim
.
pInteractingWorkUnit
,
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
pInteractionFlag
,
gpu
->
sim
.
workUnits
,
gpu
->
sim
.
pInteractionCount
);
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
pInteractionFlag
,
gpu
->
sim
.
workUnits
);
if
(
result
!=
CUDPP_SUCCESS
)
{
printf
(
"Error in cudppCompact: %d
\n
"
,
result
);
exit
(
-
1
);
}
kFindInteractionsWithinBlocksCutoff_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
kFindInteractionsWithinBlocksCutoff_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
unsigned
int
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
sizeof
(
unsigned
int
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
}
}
...
@@ -492,13 +485,7 @@ void kCalculateCustomNonbondedForces(gpuContext gpu, bool neighborListValid)
...
@@ -492,13 +485,7 @@ void kCalculateCustomNonbondedForces(gpuContext gpu, bool neighborListValid)
LAUNCHERROR
(
"kFindBlockBoundsPeriodic"
);
LAUNCHERROR
(
"kFindBlockBoundsPeriodic"
);
kFindBlocksWithInteractionsPeriodic_kernel
<<<
gpu
->
sim
.
interaction_blocks
,
gpu
->
sim
.
interaction_threads_per_block
>>>
();
kFindBlocksWithInteractionsPeriodic_kernel
<<<
gpu
->
sim
.
interaction_blocks
,
gpu
->
sim
.
interaction_threads_per_block
>>>
();
LAUNCHERROR
(
"kFindBlocksWithInteractionsPeriodic"
);
LAUNCHERROR
(
"kFindBlocksWithInteractionsPeriodic"
);
result
=
cudppCompact
(
gpu
->
cudpp
,
gpu
->
sim
.
pInteractingWorkUnit
,
gpu
->
sim
.
pInteractionCount
,
compactStream
(
gpu
->
compactPlan
,
gpu
->
sim
.
pInteractingWorkUnit
,
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
pInteractionFlag
,
gpu
->
sim
.
workUnits
,
gpu
->
sim
.
pInteractionCount
);
gpu
->
sim
.
pWorkUnit
,
gpu
->
sim
.
pInteractionFlag
,
gpu
->
sim
.
workUnits
);
if
(
result
!=
CUDPP_SUCCESS
)
{
printf
(
"Error in cudppCompact: %d
\n
"
,
result
);
exit
(
-
1
);
}
kFindInteractionsWithinBlocksPeriodic_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
kFindInteractionsWithinBlocksPeriodic_kernel
<<<
gpu
->
sim
.
nonbond_blocks
,
gpu
->
sim
.
nonbond_threads_per_block
,
sizeof
(
unsigned
int
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
sizeof
(
unsigned
int
)
*
gpu
->
sim
.
nonbond_threads_per_block
>>>
(
gpu
->
sim
.
pInteractingWorkUnit
);
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment