Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
5ab3c394
Commit
5ab3c394
authored
Sep 12, 2016
by
Peter Eastman
Browse files
Minor optimizations
parent
9e34d39e
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
40 additions
and
48 deletions
+40
-48
platforms/cuda/src/CudaKernels.cpp
platforms/cuda/src/CudaKernels.cpp
+13
-13
platforms/cuda/src/CudaSort.cpp
platforms/cuda/src/CudaSort.cpp
+2
-2
platforms/cuda/src/kernels/sort.cu
platforms/cuda/src/kernels/sort.cu
+10
-14
platforms/opencl/src/OpenCLKernels.cpp
platforms/opencl/src/OpenCLKernels.cpp
+2
-2
platforms/opencl/src/OpenCLSort.cpp
platforms/opencl/src/OpenCLSort.cpp
+3
-2
platforms/opencl/src/kernels/sort.cl
platforms/opencl/src/kernels/sort.cl
+10
-15
No files found.
platforms/cuda/src/CudaKernels.cpp
View file @
5ab3c394
...
...
@@ -2002,7 +2002,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
if (cu.getUseDoublePrecision() || cu.getComputeCapability() < 2.0 || cu.getPlatformData().deterministicForces) {
void* finishSpreadArgs[] = {&directPmeGrid->getDevicePointer()};
cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, directPmeGrid->getSize());
cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, directPmeGrid->getSize()
, 256
);
}
if (useCudaFFT) {
...
...
@@ -2019,13 +2019,13 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
void* computeEnergyArgs[] = {&reciprocalPmeGrid->getDevicePointer(), usePmeStream ? &pmeEnergyBuffer->getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(),
&pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(), &pmeBsplineModuliZ->getDevicePointer(),
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeEvalEnergyKernel, computeEnergyArgs,
cu.getNumAtoms()
);
cu.executeKernel(pmeEvalEnergyKernel, computeEnergyArgs,
gridSizeX*gridSizeY*gridSizeZ
);
}
void* convolutionArgs[] = {&reciprocalPmeGrid->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
&pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(), &pmeBsplineModuliZ->getDevicePointer(),
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeConvolutionKernel, convolutionArgs,
cu.getNumAtoms()
);
cu.executeKernel(pmeConvolutionKernel, convolutionArgs,
gridSizeX*gridSizeY*gridSizeZ, 256
);
if (useCudaFFT) {
if (cu.getUseDoublePrecision())
...
...
@@ -6443,7 +6443,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
void* args1[] = {&numAtoms, &paddedNumAtoms, &cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
&cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
cu.executeKernel(kernel1, args1, numAtoms);
cu.executeKernel(kernel1, args1, numAtoms
, 128
);
// Apply constraints.
...
...
@@ -6453,7 +6453,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
void* args2[] = {&numAtoms, &cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
&cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
cu.executeKernel(kernel2, args2, numAtoms);
cu.executeKernel(kernel2, args2, numAtoms
, 128
);
integration.computeVirtualSites();
// Update the time and step count.
...
...
@@ -6526,7 +6526,7 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
int randomIndex = integration.prepareRandomNumbers(cu.getPaddedNumAtoms());
void* args1[] = {&numAtoms, &paddedNumAtoms, &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(),
¶ms->getDevicePointer(), &integration.getStepSize().getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex};
cu.executeKernel(kernel1, args1, numAtoms);
cu.executeKernel(kernel1, args1, numAtoms
, 128
);
// Apply constraints.
...
...
@@ -6537,7 +6537,7 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
void* args2[] = {&numAtoms, &cu.getPosq().getDevicePointer(), &posCorrection, &integration.getPosDelta().getDevicePointer(),
&cu.getVelm().getDevicePointer(), &integration.getStepSize().getDevicePointer()};
cu.executeKernel(kernel2, args2, numAtoms);
cu.executeKernel(kernel2, args2, numAtoms
, 128
);
integration.computeVirtualSites();
// Update the time and step count.
...
...
@@ -6588,7 +6588,7 @@ void CudaIntegrateBrownianStepKernel::execute(ContextImpl& context, const Browni
useDouble ? (void*) &noise : (void*) &noiseFloat,
&cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(),
&cu.getVelm().getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex};
cu.executeKernel(kernel1, args1, numAtoms);
cu.executeKernel(kernel1, args1, numAtoms
, 128
);
// Apply constraints.
...
...
@@ -6599,7 +6599,7 @@ void CudaIntegrateBrownianStepKernel::execute(ContextImpl& context, const Browni
CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
void* args2[] = {&numAtoms, useDouble ? (void*) &stepSize : (void*) &stepSizeFloat,
&cu.getPosq().getDevicePointer(), &posCorrection, &cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
cu.executeKernel(kernel2, args2, numAtoms);
cu.executeKernel(kernel2, args2, numAtoms
, 128
);
integration.computeVirtualSites();
// Update the time and step count.
...
...
@@ -6652,7 +6652,7 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons
CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
void* args1[] = {&numAtoms, &paddedNumAtoms, &cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
&cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
cu.executeKernel(kernel1, args1, numAtoms);
cu.executeKernel(kernel1, args1, numAtoms
, 128
);
// Apply constraints.
...
...
@@ -6662,7 +6662,7 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons
void* args2[] = {&numAtoms, &cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
&cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
cu.executeKernel(kernel2, args2, numAtoms);
cu.executeKernel(kernel2, args2, numAtoms
, 128
);
integration.computeVirtualSites();
// Update the time and step count.
...
...
@@ -6738,7 +6738,7 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co
int randomIndex = integration.prepareRandomNumbers(cu.getPaddedNumAtoms());
void* args1[] = {&numAtoms, &paddedNumAtoms, &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(),
¶ms->getDevicePointer(), &integration.getStepSize().getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex};
cu.executeKernel(kernel1, args1, numAtoms);
cu.executeKernel(kernel1, args1, numAtoms
, 128
);
// Apply constraints.
...
...
@@ -6749,7 +6749,7 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co
CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
void* args2[] = {&numAtoms, &cu.getPosq().getDevicePointer(), &posCorrection, &integration.getPosDelta().getDevicePointer(),
&cu.getVelm().getDevicePointer(), &integration.getStepSize().getDevicePointer()};
cu.executeKernel(kernel2, args2, numAtoms);
cu.executeKernel(kernel2, args2, numAtoms
, 128
);
integration.computeVirtualSites();
// Update the time and step count.
...
...
platforms/cuda/src/CudaSort.cpp
View file @
5ab3c394
...
...
@@ -114,13 +114,13 @@ void CudaSort::sort(CudaArray& data) {
unsigned
int
numBuckets
=
bucketOffset
->
getSize
();
void
*
rangeArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
dataLength
,
&
dataRange
->
getDevicePointer
(),
&
numBuckets
,
&
bucketOffset
->
getDevicePointer
()};
context
.
executeKernel
(
computeRangeKernel
,
rangeArgs
,
rangeKernelSize
,
rangeKernelSize
,
rangeKernelSize
*
trait
->
getKeySize
());
context
.
executeKernel
(
computeRangeKernel
,
rangeArgs
,
rangeKernelSize
,
rangeKernelSize
,
2
*
rangeKernelSize
*
trait
->
getKeySize
());
// Assign array elements to buckets.
void
*
elementsArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
dataLength
,
&
numBuckets
,
&
dataRange
->
getDevicePointer
(),
&
bucketOffset
->
getDevicePointer
(),
&
bucketOfElement
->
getDevicePointer
(),
&
offsetInBucket
->
getDevicePointer
()};
context
.
executeKernel
(
assignElementsKernel
,
elementsArgs
,
data
.
getSize
());
context
.
executeKernel
(
assignElementsKernel
,
elementsArgs
,
data
.
getSize
()
,
128
);
// Compute the position of each bucket.
...
...
platforms/cuda/src/kernels/sort.cu
View file @
5ab3c394
...
...
@@ -52,7 +52,8 @@ __global__ void sortShortList(DATA_TYPE* __restrict__ data, unsigned int length)
*/
__global__
void
computeRange
(
const
DATA_TYPE
*
__restrict__
data
,
unsigned
int
length
,
KEY_TYPE
*
__restrict__
range
,
unsigned
int
numBuckets
,
unsigned
int
*
__restrict__
bucketOffset
)
{
extern
__shared__
KEY_TYPE
rangeBuffer
[];
extern
__shared__
KEY_TYPE
minBuffer
[];
KEY_TYPE
*
maxBuffer
=
minBuffer
+
blockDim
.
x
;
KEY_TYPE
minimum
=
MAX_KEY
;
KEY_TYPE
maximum
=
MIN_KEY
;
...
...
@@ -66,23 +67,18 @@ __global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int le
// Now reduce them.
rangeBuffer
[
threadIdx
.
x
]
=
minimum
;
minBuffer
[
threadIdx
.
x
]
=
minimum
;
maxBuffer
[
threadIdx
.
x
]
=
maximum
;
__syncthreads
();
for
(
unsigned
int
step
=
1
;
step
<
blockDim
.
x
;
step
*=
2
)
{
if
(
threadIdx
.
x
+
step
<
blockDim
.
x
&&
threadIdx
.
x
%
(
2
*
step
)
==
0
)
rangeBuffer
[
threadIdx
.
x
]
=
min
(
rangeBuffer
[
threadIdx
.
x
],
rangeBuffer
[
threadIdx
.
x
+
step
]);
__syncthreads
();
}
minimum
=
rangeBuffer
[
0
];
__syncthreads
();
rangeBuffer
[
threadIdx
.
x
]
=
maximum
;
__syncthreads
();
for
(
unsigned
int
step
=
1
;
step
<
blockDim
.
x
;
step
*=
2
)
{
if
(
threadIdx
.
x
+
step
<
blockDim
.
x
&&
threadIdx
.
x
%
(
2
*
step
)
==
0
)
rangeBuffer
[
threadIdx
.
x
]
=
max
(
rangeBuffer
[
threadIdx
.
x
],
rangeBuffer
[
threadIdx
.
x
+
step
]);
if
(
threadIdx
.
x
+
step
<
blockDim
.
x
&&
threadIdx
.
x
%
(
2
*
step
)
==
0
)
{
minBuffer
[
threadIdx
.
x
]
=
min
(
minBuffer
[
threadIdx
.
x
],
minBuffer
[
threadIdx
.
x
+
step
]);
maxBuffer
[
threadIdx
.
x
]
=
max
(
maxBuffer
[
threadIdx
.
x
],
maxBuffer
[
threadIdx
.
x
+
step
]);
}
__syncthreads
();
}
maximum
=
rangeBuffer
[
0
];
minimum
=
minBuffer
[
0
];
maximum
=
maxBuffer
[
0
];
if
(
threadIdx
.
x
==
0
)
{
range
[
0
]
=
minimum
;
range
[
1
]
=
maximum
;
...
...
platforms/opencl/src/OpenCLKernels.cpp
View file @
5ab3c394
...
...
@@ -2086,8 +2086,8 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeEvalEnergyKernel.setArg<mm_float4>(7, recipBoxVectorsFloat[2]);
}
if (includeEnergy)
cl.executeKernel(pmeEvalEnergyKernel,
cl.getNumAtoms()
);
cl.executeKernel(pmeConvolutionKernel,
cl.getNumAtoms()
);
cl.executeKernel(pmeEvalEnergyKernel,
gridSizeX*gridSizeY*gridSizeZ
);
cl.executeKernel(pmeConvolutionKernel,
gridSizeX*gridSizeY*gridSizeZ
);
fft->execFFT(*pmeGrid2, *pmeGrid, false);
setPeriodicBoxArgs(cl, pmeInterpolateForceKernel, 3);
if (cl.getUseDoublePrecision()) {
...
...
platforms/opencl/src/OpenCLSort.cpp
View file @
5ab3c394
...
...
@@ -124,8 +124,9 @@ void OpenCLSort::sort(OpenCLArray& data) {
computeRangeKernel
.
setArg
<
cl_uint
>
(
1
,
data
.
getSize
());
computeRangeKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
dataRange
->
getDeviceBuffer
());
computeRangeKernel
.
setArg
(
3
,
rangeKernelSize
*
trait
->
getKeySize
(),
NULL
);
computeRangeKernel
.
setArg
<
cl_int
>
(
4
,
numBuckets
);
computeRangeKernel
.
setArg
<
cl
::
Buffer
>
(
5
,
bucketOffset
->
getDeviceBuffer
());
computeRangeKernel
.
setArg
(
4
,
rangeKernelSize
*
trait
->
getKeySize
(),
NULL
);
computeRangeKernel
.
setArg
<
cl_int
>
(
5
,
numBuckets
);
computeRangeKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
bucketOffset
->
getDeviceBuffer
());
context
.
executeKernel
(
computeRangeKernel
,
rangeKernelSize
,
rangeKernelSize
);
// Assign array elements to buckets.
...
...
platforms/opencl/src/kernels/sort.cl
View file @
5ab3c394
...
...
@@ -49,8 +49,8 @@ __kernel void sortShortList(__global DATA_TYPE* __restrict__ data, uint length,
*
Calculate
the
minimum
and
maximum
value
in
the
array
to
be
sorted.
This
kernel
*
is
executed
as
a
single
work
group.
*/
__kernel
void
computeRange
(
__global
const
DATA_TYPE*
restrict
data,
uint
length,
__global
KEY_TYPE*
restrict
range,
__local
KEY_TYPE*
restrict
b
uffer,
uint
numBuckets,
__global
uint*
restrict
bucketOffset
)
{
__kernel
void
computeRange
(
__global
const
DATA_TYPE*
restrict
data,
uint
length,
__global
KEY_TYPE*
restrict
range,
__local
KEY_TYPE*
restrict
minB
uffer,
__local
KEY_TYPE*
restrict
maxBuffer,
uint
numBuckets,
__global
uint*
restrict
bucketOffset
)
{
KEY_TYPE
minimum
=
MAX_KEY
;
KEY_TYPE
maximum
=
MIN_KEY
;
...
...
@@ -64,23 +64,18 @@ __kernel void computeRange(__global const DATA_TYPE* restrict data, uint length,
//
Now
reduce
them.
buffer[get_local_id
(
0
)
]
=
minimum
;
minBuffer[get_local_id
(
0
)
]
=
minimum
;
maxBuffer[get_local_id
(
0
)
]
=
maximum
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
uint
step
=
1
; step < get_local_size(0); step *= 2) {
if
(
get_local_id
(
0
)
+step
<
get_local_size
(
0
)
&&
get_local_id
(
0
)
%
(
2*step
)
==
0
)
buffer[get_local_id
(
0
)
]
=
min
(
buffer[get_local_id
(
0
)
],
buffer[get_local_id
(
0
)
+step]
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
minimum
=
buffer[0]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
buffer[get_local_id
(
0
)
]
=
maximum
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
uint
step
=
1
; step < get_local_size(0); step *= 2) {
if
(
get_local_id
(
0
)
+step
<
get_local_size
(
0
)
&&
get_local_id
(
0
)
%
(
2*step
)
==
0
)
buffer[get_local_id
(
0
)
]
=
max
(
buffer[get_local_id
(
0
)
],
buffer[get_local_id
(
0
)
+step]
)
;
if
(
get_local_id
(
0
)
+step
<
get_local_size
(
0
)
&&
get_local_id
(
0
)
%
(
2*step
)
==
0
)
{
minBuffer[get_local_id
(
0
)
]
=
min
(
minBuffer[get_local_id
(
0
)
],
minBuffer[get_local_id
(
0
)
+step]
)
;
maxBuffer[get_local_id
(
0
)
]
=
max
(
maxBuffer[get_local_id
(
0
)
],
maxBuffer[get_local_id
(
0
)
+step]
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
maximum
=
buffer[0]
;
minimum
=
minBuffer[0]
;
maximum
=
maxBuffer[0]
;
if
(
get_local_id
(
0
)
==
0
)
{
range[0]
=
minimum
;
range[1]
=
maximum
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment