Commit c992d2c9 authored by peastman, committed by GitHub
Browse files

Merge pull request #1839 from peastman/sum

Improved performance of computing sums with CustomIntegrator
parents 3a356e24 ffddfb86
...@@ -1497,7 +1497,7 @@ private: ...@@ -1497,7 +1497,7 @@ private:
CudaContext& cu; CudaContext& cu;
double energy; double energy;
float energyFloat; float energyFloat;
int numGlobalVariables; int numGlobalVariables, sumWorkGroupSize;
bool hasInitializedKernels, deviceValuesAreCurrent, deviceGlobalsAreCurrent, modifiesParameters, keNeedsForce, hasAnyConstraints, needsEnergyParamDerivs; bool hasInitializedKernels, deviceValuesAreCurrent, deviceGlobalsAreCurrent, modifiesParameters, keNeedsForce, hasAnyConstraints, needsEnergyParamDerivs;
mutable bool localValuesAreCurrent; mutable bool localValuesAreCurrent;
CudaArray* globalValues; CudaArray* globalValues;
......
...@@ -7146,10 +7146,11 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context, ...@@ -7146,10 +7146,11 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
stepTarget.resize(numSteps); stepTarget.resize(numSteps);
merged.resize(numSteps, false); merged.resize(numSteps, false);
modifiesParameters = false; modifiesParameters = false;
sumWorkGroupSize = 512;
map<string, string> defines; map<string, string> defines;
defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms()); defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms()); defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
defines["WORK_GROUP_SIZE"] = cu.intToString(CudaContext::ThreadBlockSize); defines["WORK_GROUP_SIZE"] = cu.intToString(sumWorkGroupSize);
defines["SUM_BUFFER_SIZE"] = "0"; defines["SUM_BUFFER_SIZE"] = "0";
// Record the tabulated functions. // Record the tabulated functions.
...@@ -7695,7 +7696,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat ...@@ -7695,7 +7696,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
cu.executeKernel(randomKernel, &randomArgs[0], numAtoms); cu.executeKernel(randomKernel, &randomArgs[0], numAtoms);
cu.clearBuffer(*sumBuffer); cu.clearBuffer(*sumBuffer);
cu.executeKernel(kernels[step][0], &kernelArgs[step][0][0], numAtoms, 128); cu.executeKernel(kernels[step][0], &kernelArgs[step][0][0], numAtoms, 128);
cu.executeKernel(kernels[step][1], &kernelArgs[step][1][0], CudaContext::ThreadBlockSize, CudaContext::ThreadBlockSize); cu.executeKernel(kernels[step][1], &kernelArgs[step][1][0], sumWorkGroupSize, sumWorkGroupSize);
if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) { if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
double value; double value;
summedValue->download(&value); summedValue->download(&value);
...@@ -7797,7 +7798,7 @@ double CudaIntegrateCustomStepKernel::computeKineticEnergy(ContextImpl& context, ...@@ -7797,7 +7798,7 @@ double CudaIntegrateCustomStepKernel::computeKineticEnergy(ContextImpl& context,
cu.clearBuffer(*sumBuffer); cu.clearBuffer(*sumBuffer);
cu.executeKernel(kineticEnergyKernel, &kineticEnergyArgs[0], cu.getNumAtoms()); cu.executeKernel(kineticEnergyKernel, &kineticEnergyArgs[0], cu.getNumAtoms());
void* args[] = {&sumBuffer->getDevicePointer(), &summedValue->getDevicePointer()}; void* args[] = {&sumBuffer->getDevicePointer(), &summedValue->getDevicePointer()};
cu.executeKernel(sumKineticEnergyKernel, args, CudaContext::ThreadBlockSize, CudaContext::ThreadBlockSize); cu.executeKernel(sumKineticEnergyKernel, args, sumWorkGroupSize, sumWorkGroupSize);
if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) { if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
double ke; double ke;
summedValue->download(&ke); summedValue->download(&ke);
......
...@@ -1484,7 +1484,7 @@ private: ...@@ -1484,7 +1484,7 @@ private:
OpenCLContext& cl; OpenCLContext& cl;
double energy; double energy;
float energyFloat; float energyFloat;
int numGlobalVariables; int numGlobalVariables, sumWorkGroupSize;
bool hasInitializedKernels, deviceValuesAreCurrent, deviceGlobalsAreCurrent, modifiesParameters, keNeedsForce, hasAnyConstraints, needsEnergyParamDerivs; bool hasInitializedKernels, deviceValuesAreCurrent, deviceGlobalsAreCurrent, modifiesParameters, keNeedsForce, hasAnyConstraints, needsEnergyParamDerivs;
mutable bool localValuesAreCurrent; mutable bool localValuesAreCurrent;
OpenCLArray* globalValues; OpenCLArray* globalValues;
......
...@@ -7491,9 +7491,12 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context ...@@ -7491,9 +7491,12 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
stepTarget.resize(numSteps); stepTarget.resize(numSteps);
merged.resize(numSteps, false); merged.resize(numSteps, false);
modifiesParameters = false; modifiesParameters = false;
sumWorkGroupSize = cl.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
if (sumWorkGroupSize > 512)
sumWorkGroupSize = 512;
map<string, string> defines; map<string, string> defines;
defines["NUM_ATOMS"] = cl.intToString(cl.getNumAtoms()); defines["NUM_ATOMS"] = cl.intToString(cl.getNumAtoms());
defines["WORK_GROUP_SIZE"] = cl.intToString(OpenCLContext::ThreadBlockSize); defines["WORK_GROUP_SIZE"] = cl.intToString(sumWorkGroupSize);
// Record the tabulated functions. // Record the tabulated functions.
...@@ -8037,7 +8040,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr ...@@ -8037,7 +8040,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
cl.executeKernel(randomKernel, numAtoms); cl.executeKernel(randomKernel, numAtoms);
cl.clearBuffer(*sumBuffer); cl.clearBuffer(*sumBuffer);
cl.executeKernel(kernels[step][0], numAtoms, 128); cl.executeKernel(kernels[step][0], numAtoms, 128);
cl.executeKernel(kernels[step][1], OpenCLContext::ThreadBlockSize, OpenCLContext::ThreadBlockSize); cl.executeKernel(kernels[step][1], sumWorkGroupSize, sumWorkGroupSize);
if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) { if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
double value; double value;
summedValue->download(&value); summedValue->download(&value);
...@@ -8139,7 +8142,7 @@ double OpenCLIntegrateCustomStepKernel::computeKineticEnergy(ContextImpl& contex ...@@ -8139,7 +8142,7 @@ double OpenCLIntegrateCustomStepKernel::computeKineticEnergy(ContextImpl& contex
kineticEnergyKernel.setArg<cl::Buffer>(8, cl.getIntegrationUtilities().getRandom().getDeviceBuffer()); kineticEnergyKernel.setArg<cl::Buffer>(8, cl.getIntegrationUtilities().getRandom().getDeviceBuffer());
kineticEnergyKernel.setArg<cl_uint>(9, 0); kineticEnergyKernel.setArg<cl_uint>(9, 0);
cl.executeKernel(kineticEnergyKernel, cl.getNumAtoms()); cl.executeKernel(kineticEnergyKernel, cl.getNumAtoms());
cl.executeKernel(sumKineticEnergyKernel, OpenCLContext::ThreadBlockSize, OpenCLContext::ThreadBlockSize); cl.executeKernel(sumKineticEnergyKernel, sumWorkGroupSize, sumWorkGroupSize);
if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) { if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
double ke; double ke;
summedValue->download(&ke); summedValue->download(&ke);
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment