Commit c992d2c9 authored by peastman, committed by GitHub
Browse files

Merge pull request #1839 from peastman/sum

Improved performance of computing sums with CustomIntegrator
parents 3a356e24 ffddfb86
......@@ -1497,7 +1497,7 @@ private:
CudaContext& cu;
double energy;
float energyFloat;
int numGlobalVariables;
int numGlobalVariables, sumWorkGroupSize;
bool hasInitializedKernels, deviceValuesAreCurrent, deviceGlobalsAreCurrent, modifiesParameters, keNeedsForce, hasAnyConstraints, needsEnergyParamDerivs;
mutable bool localValuesAreCurrent;
CudaArray* globalValues;
......
......@@ -7146,10 +7146,11 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
stepTarget.resize(numSteps);
merged.resize(numSteps, false);
modifiesParameters = false;
sumWorkGroupSize = 512;
map<string, string> defines;
defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
defines["WORK_GROUP_SIZE"] = cu.intToString(CudaContext::ThreadBlockSize);
defines["WORK_GROUP_SIZE"] = cu.intToString(sumWorkGroupSize);
defines["SUM_BUFFER_SIZE"] = "0";
// Record the tabulated functions.
......@@ -7695,7 +7696,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
cu.executeKernel(randomKernel, &randomArgs[0], numAtoms);
cu.clearBuffer(*sumBuffer);
cu.executeKernel(kernels[step][0], &kernelArgs[step][0][0], numAtoms, 128);
cu.executeKernel(kernels[step][1], &kernelArgs[step][1][0], CudaContext::ThreadBlockSize, CudaContext::ThreadBlockSize);
cu.executeKernel(kernels[step][1], &kernelArgs[step][1][0], sumWorkGroupSize, sumWorkGroupSize);
if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
double value;
summedValue->download(&value);
......@@ -7797,7 +7798,7 @@ double CudaIntegrateCustomStepKernel::computeKineticEnergy(ContextImpl& context,
cu.clearBuffer(*sumBuffer);
cu.executeKernel(kineticEnergyKernel, &kineticEnergyArgs[0], cu.getNumAtoms());
void* args[] = {&sumBuffer->getDevicePointer(), &summedValue->getDevicePointer()};
cu.executeKernel(sumKineticEnergyKernel, args, CudaContext::ThreadBlockSize, CudaContext::ThreadBlockSize);
cu.executeKernel(sumKineticEnergyKernel, args, sumWorkGroupSize, sumWorkGroupSize);
if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
double ke;
summedValue->download(&ke);
......
......@@ -1484,7 +1484,7 @@ private:
OpenCLContext& cl;
double energy;
float energyFloat;
int numGlobalVariables;
int numGlobalVariables, sumWorkGroupSize;
bool hasInitializedKernels, deviceValuesAreCurrent, deviceGlobalsAreCurrent, modifiesParameters, keNeedsForce, hasAnyConstraints, needsEnergyParamDerivs;
mutable bool localValuesAreCurrent;
OpenCLArray* globalValues;
......
......@@ -7491,9 +7491,12 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
stepTarget.resize(numSteps);
merged.resize(numSteps, false);
modifiesParameters = false;
sumWorkGroupSize = cl.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
if (sumWorkGroupSize > 512)
sumWorkGroupSize = 512;
map<string, string> defines;
defines["NUM_ATOMS"] = cl.intToString(cl.getNumAtoms());
defines["WORK_GROUP_SIZE"] = cl.intToString(OpenCLContext::ThreadBlockSize);
defines["WORK_GROUP_SIZE"] = cl.intToString(sumWorkGroupSize);
// Record the tabulated functions.
......@@ -8037,7 +8040,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
cl.executeKernel(randomKernel, numAtoms);
cl.clearBuffer(*sumBuffer);
cl.executeKernel(kernels[step][0], numAtoms, 128);
cl.executeKernel(kernels[step][1], OpenCLContext::ThreadBlockSize, OpenCLContext::ThreadBlockSize);
cl.executeKernel(kernels[step][1], sumWorkGroupSize, sumWorkGroupSize);
if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
double value;
summedValue->download(&value);
......@@ -8139,7 +8142,7 @@ double OpenCLIntegrateCustomStepKernel::computeKineticEnergy(ContextImpl& contex
kineticEnergyKernel.setArg<cl::Buffer>(8, cl.getIntegrationUtilities().getRandom().getDeviceBuffer());
kineticEnergyKernel.setArg<cl_uint>(9, 0);
cl.executeKernel(kineticEnergyKernel, cl.getNumAtoms());
cl.executeKernel(sumKineticEnergyKernel, OpenCLContext::ThreadBlockSize, OpenCLContext::ThreadBlockSize);
cl.executeKernel(sumKineticEnergyKernel, sumWorkGroupSize, sumWorkGroupSize);
if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
double ke;
summedValue->download(&ke);
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment