"csrc/vscode:/vscode.git/clone" did not exist on "4dd1e68ac81c8fb63243bcfbbcf942eae5243210"
Commit 8cda1b79 authored by Peter Eastman's avatar Peter Eastman
Browse files

Finished implementing mixed precision model

parent 9ad85ebd
...@@ -378,7 +378,8 @@ void CudaApplyConstraintsKernel::apply(ContextImpl& context, double tol) { ...@@ -378,7 +378,8 @@ void CudaApplyConstraintsKernel::apply(ContextImpl& context, double tol) {
CudaIntegrationUtilities& integration = cu.getIntegrationUtilities(); CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
cu.clearBuffer(integration.getPosDelta()); cu.clearBuffer(integration.getPosDelta());
integration.applyConstraints(tol); integration.applyConstraints(tol);
void* args[] = {&cu.getPosq().getDevicePointer(), &cu.getIntegrationUtilities().getPosDelta().getDevicePointer()}; CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
void* args[] = {&cu.getPosq().getDevicePointer(), &posCorrection, &cu.getIntegrationUtilities().getPosDelta().getDevicePointer()};
cu.executeKernel(applyDeltasKernel, args, cu.getNumAtoms()); cu.executeKernel(applyDeltasKernel, args, cu.getNumAtoms());
integration.computeVirtualSites(); integration.computeVirtualSites();
} }
...@@ -4144,12 +4145,13 @@ void CudaIntegrateBrownianStepKernel::execute(ContextImpl& context, const Browni ...@@ -4144,12 +4145,13 @@ void CudaIntegrateBrownianStepKernel::execute(ContextImpl& context, const Browni
float stepSizeFloat = (float) stepSize; float stepSizeFloat = (float) stepSize;
float tauDtFloat = (float) tauDt; float tauDtFloat = (float) tauDt;
float noiseFloat = (float) noise; float noiseFloat = (float) noise;
bool useDouble = cu.getUseDoublePrecision() || cu.getUseMixedPrecision();
// Call the first integration kernel. // Call the first integration kernel.
int randomIndex = integration.prepareRandomNumbers(cu.getPaddedNumAtoms()); int randomIndex = integration.prepareRandomNumbers(cu.getPaddedNumAtoms());
void* args1[] = {cu.getUseDoublePrecision() ? (void*) &tauDt : (void*) &tauDtFloat, void* args1[] = {useDouble ? (void*) &tauDt : (void*) &tauDtFloat,
cu.getUseDoublePrecision() ? (void*) &noise : (void*) &noiseFloat, useDouble ? (void*) &noise : (void*) &noiseFloat,
&cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(),
&cu.getVelm().getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex}; &cu.getVelm().getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex};
cu.executeKernel(kernel1, args1, numAtoms); cu.executeKernel(kernel1, args1, numAtoms);
...@@ -4160,8 +4162,9 @@ void CudaIntegrateBrownianStepKernel::execute(ContextImpl& context, const Browni ...@@ -4160,8 +4162,9 @@ void CudaIntegrateBrownianStepKernel::execute(ContextImpl& context, const Browni
// Call the second integration kernel. // Call the second integration kernel.
void* args2[] = {cu.getUseDoublePrecision() ? (void*) &stepSize : (void*) &stepSizeFloat, CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
&cu.getPosq().getDevicePointer(), &cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()}; void* args2[] = {useDouble ? (void*) &stepSize : (void*) &stepSizeFloat,
&cu.getPosq().getDevicePointer(), &posCorrection, &cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
cu.executeKernel(kernel2, args2, numAtoms); cu.executeKernel(kernel2, args2, numAtoms);
integration.computeVirtualSites(); integration.computeVirtualSites();
...@@ -4354,7 +4357,7 @@ public: ...@@ -4354,7 +4357,7 @@ public:
return; return;
int numAtoms = cu.getNumAtoms(); int numAtoms = cu.getNumAtoms();
const vector<int>& order = cu.getAtomIndex(); const vector<int>& order = cu.getAtomIndex();
if (cu.getUseDoublePrecision()) { if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
if (deviceValuesAreCurrent) if (deviceValuesAreCurrent)
perDofValues.getParameterValues(localPerDofValuesDouble); perDofValues.getParameterValues(localPerDofValuesDouble);
vector<vector<double> > swap(3*numAtoms); vector<vector<double> > swap(3*numAtoms);
...@@ -4422,11 +4425,11 @@ void CudaIntegrateCustomStepKernel::initialize(const System& system, const Custo ...@@ -4422,11 +4425,11 @@ void CudaIntegrateCustomStepKernel::initialize(const System& system, const Custo
cu.setAsCurrent(); cu.setAsCurrent();
cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed()); cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
numGlobalVariables = integrator.getNumGlobalVariables(); numGlobalVariables = integrator.getNumGlobalVariables();
int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float)); int elementSize = (cu.getUseDoublePrecision() || cu.getUseMixedPrecision() ? sizeof(double) : sizeof(float));
globalValues = new CudaArray(cu, max(1, numGlobalVariables), elementSize, "globalVariables"); globalValues = new CudaArray(cu, max(1, numGlobalVariables), elementSize, "globalVariables");
sumBuffer = new CudaArray(cu, 3*system.getNumParticles(), elementSize, "sumBuffer"); sumBuffer = new CudaArray(cu, 3*system.getNumParticles(), elementSize, "sumBuffer");
energy = new CudaArray(cu, 1, elementSize, "energy"); energy = new CudaArray(cu, 1, cu.getEnergyBuffer().getElementSize(), "energy");
perDofValues = new CudaParameterSet(cu, integrator.getNumPerDofVariables(), 3*system.getNumParticles(), "perDofVariables", false, cu.getUseDoublePrecision()); perDofValues = new CudaParameterSet(cu, integrator.getNumPerDofVariables(), 3*system.getNumParticles(), "perDofVariables", false, cu.getUseDoublePrecision() || cu.getUseMixedPrecision());
cu.addReorderListener(new ReorderListener(cu, *perDofValues, localPerDofValuesFloat, localPerDofValuesDouble, deviceValuesAreCurrent)); cu.addReorderListener(new ReorderListener(cu, *perDofValues, localPerDofValuesFloat, localPerDofValuesDouble, deviceValuesAreCurrent));
prevStepSize = -1.0; prevStepSize = -1.0;
SimTKOpenMMUtilities::setRandomNumberSeed(integrator.getRandomNumberSeed()); SimTKOpenMMUtilities::setRandomNumberSeed(integrator.getRandomNumberSeed());
...@@ -4502,13 +4505,14 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat ...@@ -4502,13 +4505,14 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
CudaIntegrationUtilities& integration = cu.getIntegrationUtilities(); CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
int numAtoms = cu.getNumAtoms(); int numAtoms = cu.getNumAtoms();
int numSteps = integrator.getNumComputations(); int numSteps = integrator.getNumComputations();
bool useDouble = cu.getUseDoublePrecision() || cu.getUseMixedPrecision();
if (!hasInitializedKernels) { if (!hasInitializedKernels) {
hasInitializedKernels = true; hasInitializedKernels = true;
// Initialize various data structures. // Initialize various data structures.
const map<string, double>& params = context.getParameters(); const map<string, double>& params = context.getParameters();
if (cu.getUseDoublePrecision()) { if (useDouble) {
contextParameterValues = CudaArray::create<double>(cu, max(1, (int) params.size()), "contextParameters"); contextParameterValues = CudaArray::create<double>(cu, max(1, (int) params.size()), "contextParameters");
contextValuesDouble.resize(contextParameterValues->getSize()); contextValuesDouble.resize(contextParameterValues->getSize());
for (map<string, double>::const_iterator iter = params.begin(); iter != params.end(); ++iter) { for (map<string, double>::const_iterator iter = params.begin(); iter != params.end(); ++iter) {
...@@ -4674,9 +4678,9 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat ...@@ -4674,9 +4678,9 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
compute << createPerDofComputation(stepType[j] == CustomIntegrator::ComputePerDof ? variable[j] : "", expression[j], i, integrator, forceName[j], energyName[j]); compute << createPerDofComputation(stepType[j] == CustomIntegrator::ComputePerDof ? variable[j] : "", expression[j], i, integrator, forceName[j], energyName[j]);
if (variable[j] == "x") { if (variable[j] == "x") {
if (storePosAsDelta[j]) if (storePosAsDelta[j])
compute << "posDelta[index] = convertFromDouble4(position-convertToDouble4(posq[index]));\n"; compute << "posDelta[index] = convertFromDouble4(position-convertToDouble4(loadPos(posq, posqCorrection, index)));\n";
else else
compute << "posq[index] = convertFromDouble4(position);\n"; compute << "storePos(posq, posqCorrection, index, convertFromDouble4(position));\n";
} }
else if (variable[j] == "v") else if (variable[j] == "v")
compute << "velm[index] = convertFromDouble4(velocity);\n"; compute << "velm[index] = convertFromDouble4(velocity);\n";
...@@ -4712,6 +4716,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat ...@@ -4712,6 +4716,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
requiredUniform[step] = numUniform; requiredUniform[step] = numUniform;
vector<void*> args1; vector<void*> args1;
args1.push_back(&cu.getPosq().getDevicePointer()); args1.push_back(&cu.getPosq().getDevicePointer());
args1.push_back(NULL);
args1.push_back(&integration.getPosDelta().getDevicePointer()); args1.push_back(&integration.getPosDelta().getDevicePointer());
args1.push_back(&cu.getVelm().getDevicePointer()); args1.push_back(&cu.getVelm().getDevicePointer());
args1.push_back(&cu.getForce().getDevicePointer()); args1.push_back(&cu.getForce().getDevicePointer());
...@@ -4749,7 +4754,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat ...@@ -4749,7 +4754,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
throw OpenMMException("Unknown global variable: "+variable[step]); throw OpenMMException("Unknown global variable: "+variable[step]);
defines["SUM_BUFFER_SIZE"] = cu.intToString(3*numAtoms); defines["SUM_BUFFER_SIZE"] = cu.intToString(3*numAtoms);
module = cu.createModule(CudaKernelSources::customIntegrator, defines); module = cu.createModule(CudaKernelSources::customIntegrator, defines);
kernel = cu.getKernel(module, "computeSum"); kernel = cu.getKernel(module, useDouble ? "computeDoubleSum" : "computeFloatSum");
kernels[step].push_back(kernel); kernels[step].push_back(kernel);
kernelArgs[step].push_back(args2); kernelArgs[step].push_back(args2);
} }
...@@ -4782,6 +4787,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat ...@@ -4782,6 +4787,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
kernels[step].push_back(kernel); kernels[step].push_back(kernel);
vector<void*> args; vector<void*> args;
args.push_back(&cu.getPosq().getDevicePointer()); args.push_back(&cu.getPosq().getDevicePointer());
args.push_back(NULL);
args.push_back(&integration.getPosDelta().getDevicePointer()); args.push_back(&integration.getPosDelta().getDevicePointer());
kernelArgs[step].push_back(args); kernelArgs[step].push_back(args);
} }
...@@ -4792,13 +4798,13 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat ...@@ -4792,13 +4798,13 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
defines["SUM_OUTPUT_INDEX"] = "0"; defines["SUM_OUTPUT_INDEX"] = "0";
defines["SUM_BUFFER_SIZE"] = cu.intToString(cu.getEnergyBuffer().getSize()); defines["SUM_BUFFER_SIZE"] = cu.intToString(cu.getEnergyBuffer().getSize());
CUmodule module = cu.createModule(CudaKernelSources::customIntegrator, defines); CUmodule module = cu.createModule(CudaKernelSources::customIntegrator, defines);
sumEnergyKernel = cu.getKernel(module, "computeSum"); sumEnergyKernel = cu.getKernel(module, cu.getUseDoublePrecision() ? "computeDoubleSum" : "computeFloatSum");
} }
// Make sure all values (variables, parameters, etc.) stored on the device are up to date. // Make sure all values (variables, parameters, etc.) stored on the device are up to date.
if (!deviceValuesAreCurrent) { if (!deviceValuesAreCurrent) {
if (cu.getUseDoublePrecision()) if (useDouble)
perDofValues->setParameterValues(localPerDofValuesDouble); perDofValues->setParameterValues(localPerDofValuesDouble);
else else
perDofValues->setParameterValues(localPerDofValuesFloat); perDofValues->setParameterValues(localPerDofValuesFloat);
...@@ -4807,7 +4813,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat ...@@ -4807,7 +4813,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
localValuesAreCurrent = false; localValuesAreCurrent = false;
double stepSize = integrator.getStepSize(); double stepSize = integrator.getStepSize();
if (stepSize != prevStepSize) { if (stepSize != prevStepSize) {
if (cu.getUseDoublePrecision()) { if (useDouble) {
double size[] = {0, stepSize}; double size[] = {0, stepSize};
integration.getStepSize().upload(size); integration.getStepSize().upload(size);
} }
...@@ -4818,7 +4824,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat ...@@ -4818,7 +4824,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
prevStepSize = stepSize; prevStepSize = stepSize;
} }
bool paramsChanged = false; bool paramsChanged = false;
if (cu.getUseDoublePrecision()) { if (useDouble) {
for (int i = 0; i < (int) parameterNames.size(); i++) { for (int i = 0; i < (int) parameterNames.size(); i++) {
double value = context.getParameter(parameterNames[i]); double value = context.getParameter(parameterNames[i]);
if (value != contextValuesDouble[i]) { if (value != contextValuesDouble[i]) {
...@@ -4844,6 +4850,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat ...@@ -4844,6 +4850,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
// Loop over computation steps in the integrator and execute them. // Loop over computation steps in the integrator and execute them.
void* randomArgs[] = {&uniformRandoms->getDevicePointer(), &randomSeed->getDevicePointer()}; void* randomArgs[] = {&uniformRandoms->getDevicePointer(), &randomSeed->getDevicePointer()};
CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
for (int i = 0; i < numSteps; i++) { for (int i = 0; i < numSteps; i++) {
if ((needsForces[i] || needsEnergy[i]) && (!forcesAreValid || context.getLastForceGroups() != forceGroup[i])) { if ((needsForces[i] || needsEnergy[i]) && (!forcesAreValid || context.getLastForceGroups() != forceGroup[i])) {
// Recompute forces and/or energy. Figure out what is actually needed // Recompute forces and/or energy. Figure out what is actually needed
...@@ -4872,7 +4879,8 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat ...@@ -4872,7 +4879,8 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
} }
if (stepType[i] == CustomIntegrator::ComputePerDof && !merged[i]) { if (stepType[i] == CustomIntegrator::ComputePerDof && !merged[i]) {
int randomIndex = integration.prepareRandomNumbers(requiredGaussian[i]); int randomIndex = integration.prepareRandomNumbers(requiredGaussian[i]);
kernelArgs[i][0][9] = &randomIndex; kernelArgs[i][0][1] = &posCorrection;
kernelArgs[i][0][10] = &randomIndex;
if (requiredUniform[i] > 0) if (requiredUniform[i] > 0)
cu.executeKernel(randomKernel, &randomArgs[0], numAtoms); cu.executeKernel(randomKernel, &randomArgs[0], numAtoms);
cu.executeKernel(kernels[i][0], &kernelArgs[i][0][0], numAtoms); cu.executeKernel(kernels[i][0], &kernelArgs[i][0][0], numAtoms);
...@@ -4886,7 +4894,8 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat ...@@ -4886,7 +4894,8 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
} }
else if (stepType[i] == CustomIntegrator::ComputeSum) { else if (stepType[i] == CustomIntegrator::ComputeSum) {
int randomIndex = integration.prepareRandomNumbers(requiredGaussian[i]); int randomIndex = integration.prepareRandomNumbers(requiredGaussian[i]);
kernelArgs[i][0][9] = &randomIndex; kernelArgs[i][0][1] = &posCorrection;
kernelArgs[i][0][10] = &randomIndex;
if (requiredUniform[i] > 0) if (requiredUniform[i] > 0)
cu.executeKernel(randomKernel, &randomArgs[0], numAtoms); cu.executeKernel(randomKernel, &randomArgs[0], numAtoms);
cu.executeKernel(kernels[i][0], &kernelArgs[i][0][0], numAtoms); cu.executeKernel(kernels[i][0], &kernelArgs[i][0][0], numAtoms);
...@@ -4898,6 +4907,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat ...@@ -4898,6 +4907,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
} }
else if (stepType[i] == CustomIntegrator::ConstrainPositions) { else if (stepType[i] == CustomIntegrator::ConstrainPositions) {
cu.getIntegrationUtilities().applyConstraints(integrator.getConstraintTolerance()); cu.getIntegrationUtilities().applyConstraints(integrator.getConstraintTolerance());
kernelArgs[i][0][1] = &posCorrection;
cu.executeKernel(kernels[i][0], &kernelArgs[i][0][0], numAtoms); cu.executeKernel(kernels[i][0], &kernelArgs[i][0][0], numAtoms);
cu.getIntegrationUtilities().computeVirtualSites(); cu.getIntegrationUtilities().computeVirtualSites();
} }
...@@ -4918,7 +4928,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat ...@@ -4918,7 +4928,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
void CudaIntegrateCustomStepKernel::recordChangedParameters(ContextImpl& context) { void CudaIntegrateCustomStepKernel::recordChangedParameters(ContextImpl& context) {
if (!modifiesParameters) if (!modifiesParameters)
return; return;
if (cu.getUseDoublePrecision()) { if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
contextParameterValues->download(contextValuesDouble); contextParameterValues->download(contextValuesDouble);
for (int i = 0; i < (int) parameterNames.size(); i++) { for (int i = 0; i < (int) parameterNames.size(); i++) {
double value = context.getParameter(parameterNames[i]); double value = context.getParameter(parameterNames[i]);
...@@ -4940,7 +4950,7 @@ void CudaIntegrateCustomStepKernel::getGlobalVariables(ContextImpl& context, vec ...@@ -4940,7 +4950,7 @@ void CudaIntegrateCustomStepKernel::getGlobalVariables(ContextImpl& context, vec
values.resize(numGlobalVariables); values.resize(numGlobalVariables);
if (numGlobalVariables == 0) if (numGlobalVariables == 0)
return; return;
if (cu.getUseDoublePrecision()) if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision())
globalValues->download(values); globalValues->download(values);
else { else {
vector<float> buffer; vector<float> buffer;
...@@ -4953,7 +4963,7 @@ void CudaIntegrateCustomStepKernel::getGlobalVariables(ContextImpl& context, vec ...@@ -4953,7 +4963,7 @@ void CudaIntegrateCustomStepKernel::getGlobalVariables(ContextImpl& context, vec
void CudaIntegrateCustomStepKernel::setGlobalVariables(ContextImpl& context, const vector<double>& values) { void CudaIntegrateCustomStepKernel::setGlobalVariables(ContextImpl& context, const vector<double>& values) {
if (numGlobalVariables == 0) if (numGlobalVariables == 0)
return; return;
if (cu.getUseDoublePrecision()) if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision())
globalValues->upload(values); globalValues->upload(values);
else { else {
vector<float> buffer(numGlobalVariables); vector<float> buffer(numGlobalVariables);
...@@ -4966,7 +4976,7 @@ void CudaIntegrateCustomStepKernel::setGlobalVariables(ContextImpl& context, con ...@@ -4966,7 +4976,7 @@ void CudaIntegrateCustomStepKernel::setGlobalVariables(ContextImpl& context, con
void CudaIntegrateCustomStepKernel::getPerDofVariable(ContextImpl& context, int variable, vector<Vec3>& values) const { void CudaIntegrateCustomStepKernel::getPerDofVariable(ContextImpl& context, int variable, vector<Vec3>& values) const {
values.resize(perDofValues->getNumObjects()/3); values.resize(perDofValues->getNumObjects()/3);
const vector<int>& order = cu.getAtomIndex(); const vector<int>& order = cu.getAtomIndex();
if (cu.getUseDoublePrecision()) { if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
if (!localValuesAreCurrent) { if (!localValuesAreCurrent) {
perDofValues->getParameterValues(localPerDofValuesDouble); perDofValues->getParameterValues(localPerDofValuesDouble);
localValuesAreCurrent = true; localValuesAreCurrent = true;
...@@ -4988,7 +4998,7 @@ void CudaIntegrateCustomStepKernel::getPerDofVariable(ContextImpl& context, int ...@@ -4988,7 +4998,7 @@ void CudaIntegrateCustomStepKernel::getPerDofVariable(ContextImpl& context, int
void CudaIntegrateCustomStepKernel::setPerDofVariable(ContextImpl& context, int variable, const vector<Vec3>& values) { void CudaIntegrateCustomStepKernel::setPerDofVariable(ContextImpl& context, int variable, const vector<Vec3>& values) {
const vector<int>& order = cu.getAtomIndex(); const vector<int>& order = cu.getAtomIndex();
if (cu.getUseDoublePrecision()) { if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
if (!localValuesAreCurrent) { if (!localValuesAreCurrent) {
perDofValues->getParameterValues(localPerDofValuesDouble); perDofValues->getParameterValues(localPerDofValuesDouble);
localValuesAreCurrent = true; localValuesAreCurrent = true;
......
...@@ -2,12 +2,12 @@ ...@@ -2,12 +2,12 @@
* Perform the first step of Brownian integration. * Perform the first step of Brownian integration.
*/ */
extern "C" __global__ void integrateBrownianPart1(real tauDeltaT, real noiseAmplitude, const long long* __restrict__ force, extern "C" __global__ void integrateBrownianPart1(mixed tauDeltaT, mixed noiseAmplitude, const long long* __restrict__ force,
real4* __restrict__ posDelta, const real4* __restrict__ velm, const float4* __restrict__ random, unsigned int randomIndex) { mixed4* __restrict__ posDelta, const mixed4* __restrict__ velm, const float4* __restrict__ random, unsigned int randomIndex) {
randomIndex += blockIdx.x*blockDim.x+threadIdx.x; randomIndex += blockIdx.x*blockDim.x+threadIdx.x;
const real fscale = tauDeltaT/(real) 0xFFFFFFFF; const mixed fscale = tauDeltaT/(mixed) 0xFFFFFFFF;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) { for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
real invMass = velm[index].w; mixed invMass = velm[index].w;
if (invMass != 0) { if (invMass != 0) {
posDelta[index].x = fscale*invMass*force[index] + noiseAmplitude*SQRT(invMass)*random[randomIndex].x; posDelta[index].x = fscale*invMass*force[index] + noiseAmplitude*SQRT(invMass)*random[randomIndex].x;
posDelta[index].y = fscale*invMass*force[index+PADDED_NUM_ATOMS] + noiseAmplitude*SQRT(invMass)*random[randomIndex].y; posDelta[index].y = fscale*invMass*force[index+PADDED_NUM_ATOMS] + noiseAmplitude*SQRT(invMass)*random[randomIndex].y;
...@@ -21,17 +21,30 @@ extern "C" __global__ void integrateBrownianPart1(real tauDeltaT, real noiseAmpl ...@@ -21,17 +21,30 @@ extern "C" __global__ void integrateBrownianPart1(real tauDeltaT, real noiseAmpl
* Perform the second step of Brownian integration. * Perform the second step of Brownian integration.
*/ */
extern "C" __global__ void integrateBrownianPart2(real deltaT, real4* posq, real4* velm, const real4* __restrict__ posDelta) { extern "C" __global__ void integrateBrownianPart2(mixed deltaT, real4* posq, real4* __restrict__ posqCorrection, mixed4* velm, const mixed4* __restrict__ posDelta) {
const real oneOverDeltaT = RECIP(deltaT); const mixed oneOverDeltaT = RECIP(deltaT);
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) { for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
if (velm[index].w != 0) { if (velm[index].w != 0) {
real4 delta = posDelta[index]; mixed4 delta = posDelta[index];
velm[index].x = oneOverDeltaT*delta.x; velm[index].x = oneOverDeltaT*delta.x;
velm[index].y = oneOverDeltaT*delta.y; velm[index].y = oneOverDeltaT*delta.y;
velm[index].z = oneOverDeltaT*delta.z; velm[index].z = oneOverDeltaT*delta.z;
posq[index].x = posq[index].x + delta.x; #ifdef USE_MIXED_PRECISION
posq[index].y = posq[index].y + delta.y; real4 pos1 = posq[index];
posq[index].z = posq[index].z + delta.z; real4 pos2 = posqCorrection[index];
mixed4 pos = make_mixed4(pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
#else
real4 pos = posq[index];
#endif
pos.x += delta.x;
pos.y += delta.y;
pos.z += delta.z;
#ifdef USE_MIXED_PRECISION
posq[index] = make_real4((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
posqCorrection[index] = make_real4(pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
#else
posq[index] = pos;
#endif
} }
} }
} }
extern "C" __global__ void applyPositionDeltas(real4* __restrict__ posq, real4* __restrict__ posDelta) { extern "C" __global__ void applyPositionDeltas(real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ posDelta) {
for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) { for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
real4 position = posq[index]; #ifdef USE_MIXED_PRECISION
position.x += posDelta[index].x; real4 pos1 = posq[index];
position.y += posDelta[index].y; real4 pos2 = posqCorrection[index];
position.z += posDelta[index].z; mixed4 pos = make_mixed4(pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
posq[index] = position; #else
mixed4 pos = posq[index];
#endif
pos.x += posDelta[index].x;
pos.y += posDelta[index].y;
pos.z += posDelta[index].z;
#ifdef USE_MIXED_PRECISION
posq[index] = make_real4((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
posqCorrection[index] = make_real4(pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
#else
posq[index] = pos;
#endif
} }
} }
extern "C" __global__ void computeSum(const real* __restrict__ sumBuffer, real* result) { extern "C" __global__ void computeFloatSum(const float* __restrict__ sumBuffer, float* result) {
__shared__ real tempBuffer[WORK_GROUP_SIZE]; __shared__ float tempBuffer[WORK_GROUP_SIZE];
const unsigned int thread = threadIdx.x; const unsigned int thread = threadIdx.x;
real sum = 0; float sum = 0;
for (unsigned int index = thread; index < SUM_BUFFER_SIZE; index += blockDim.x) for (unsigned int index = thread; index < SUM_BUFFER_SIZE; index += blockDim.x)
sum += sumBuffer[index]; sum += sumBuffer[index];
tempBuffer[thread] = sum; tempBuffer[thread] = sum;
...@@ -14,14 +14,41 @@ extern "C" __global__ void computeSum(const real* __restrict__ sumBuffer, real* ...@@ -14,14 +14,41 @@ extern "C" __global__ void computeSum(const real* __restrict__ sumBuffer, real*
result[SUM_OUTPUT_INDEX] = tempBuffer[0]; result[SUM_OUTPUT_INDEX] = tempBuffer[0];
} }
extern "C" __global__ void applyPositionDeltas(real4* __restrict__ posq, real4* __restrict__ posDelta) { extern "C" __global__ void computeDoubleSum(const double* __restrict__ sumBuffer, double* result) {
__shared__ double tempBuffer[WORK_GROUP_SIZE];
const unsigned int thread = threadIdx.x;
double sum = 0;
for (unsigned int index = thread; index < SUM_BUFFER_SIZE; index += blockDim.x)
sum += sumBuffer[index];
tempBuffer[thread] = sum;
for (int i = 1; i < WORK_GROUP_SIZE; i *= 2) {
__syncthreads();
if (thread%(i*2) == 0 && thread+i < WORK_GROUP_SIZE)
tempBuffer[thread] += tempBuffer[thread+i];
}
if (thread == 0)
result[SUM_OUTPUT_INDEX] = tempBuffer[0];
}
extern "C" __global__ void applyPositionDeltas(real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ posDelta) {
for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) { for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
real4 position = posq[index]; #ifdef USE_MIXED_PRECISION
position.x += posDelta[index].x; real4 pos1 = posq[index];
position.y += posDelta[index].y; real4 pos2 = posqCorrection[index];
position.z += posDelta[index].z; mixed4 pos = make_mixed4(pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
posq[index] = position; #else
posDelta[index] = make_real4(0, 0, 0, 0); real4 pos = posq[index];
#endif
pos.x += posDelta[index].x;
pos.y += posDelta[index].y;
pos.z += posDelta[index].z;
#ifdef USE_MIXED_PRECISION
posq[index] = make_real4((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
posqCorrection[index] = make_real4(pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
#else
posq[index] = pos;
#endif
posDelta[index] = make_mixed4(0, 0, 0, 0);
} }
} }
......
extern "C" __global__ void computeGlobal(real2* __restrict__ dt, real* __restrict__ globals, real* __restrict__ params, extern "C" __global__ void computeGlobal(mixed2* __restrict__ dt, mixed* __restrict__ globals, mixed* __restrict__ params,
float uniform, float gaussian, const real* __restrict__ energy) { float uniform, float gaussian, const real* __restrict__ energy) {
COMPUTE_STEP COMPUTE_STEP
} }
inline __device__ double4 convertToDouble4(real4 a) { /**
* Load the position of a particle.
*/
inline __device__ mixed4 loadPos(const real4* __restrict__ posq, const real4* __restrict__ posqCorrection, int index) {
#ifdef USE_MIXED_PRECISION
real4 pos1 = posq[index];
real4 pos2 = posqCorrection[index];
return make_mixed4(pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
#else
return posq[index];
#endif
}
/**
* Store the position of a particle.
*/
inline __device__ void storePos(real4* __restrict__ posq, real4* __restrict__ posqCorrection, int index, mixed4 pos) {
#ifdef USE_MIXED_PRECISION
posq[index] = make_real4((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
posqCorrection[index] = make_real4(pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
#else
posq[index] = pos;
#endif
}
inline __device__ double4 convertToDouble4(mixed4 a) {
return make_double4(a.x, a.y, a.z, a.w); return make_double4(a.x, a.y, a.z, a.w);
} }
inline __device__ real4 convertFromDouble4(double4 a) { inline __device__ mixed4 convertFromDouble4(double4 a) {
return make_real4(a.x, a.y, a.z, a.w); return make_mixed4(a.x, a.y, a.z, a.w);
} }
extern "C" __global__ void computePerDof(real4* __restrict__ posq, real4* __restrict__ posDelta, real4* __restrict__ velm, extern "C" __global__ void computePerDof(real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ posDelta,
const long long* __restrict__ force, const real2* __restrict__ dt, const real* __restrict__ globals, mixed4* __restrict__ velm, const long long* __restrict__ force, const mixed2* __restrict__ dt, const mixed* __restrict__ globals,
const real* __restrict__ params, real* __restrict__ sum, const float4* __restrict__ gaussianValues, const mixed* __restrict__ params, mixed* __restrict__ sum, const float4* __restrict__ gaussianValues,
unsigned int randomIndex, const float4* __restrict__ uniformValues, const real* __restrict__ energy unsigned int randomIndex, const float4* __restrict__ uniformValues, const real* __restrict__ energy
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
real stepSize = dt[0].y; mixed stepSize = dt[0].y;
int index = blockIdx.x*blockDim.x+threadIdx.x; int index = blockIdx.x*blockDim.x+threadIdx.x;
randomIndex += index; randomIndex += index;
const double forceScale = 1.0/0xFFFFFFFF; const double forceScale = 1.0/0xFFFFFFFF;
while (index < NUM_ATOMS) { while (index < NUM_ATOMS) {
#ifdef LOAD_POS_AS_DELTA #ifdef LOAD_POS_AS_DELTA
double4 position = convertToDouble4(posq[index]+posDelta[index]); double4 position = convertToDouble4(loadPos(posq, posqCorrection, index)+posDelta[index]);
#else #else
double4 position = convertToDouble4(posq[index]); double4 position = convertToDouble4(loadPos(posq, posqCorrection, index));
#endif #endif
double4 velocity = convertToDouble4(velm[index]); double4 velocity = convertToDouble4(velm[index]);
double4 f = make_double4(forceScale*force[index], forceScale*force[index+PADDED_NUM_ATOMS], forceScale*force[index+PADDED_NUM_ATOMS*2], 0.0); double4 f = make_double4(forceScale*force[index], forceScale*force[index+PADDED_NUM_ATOMS], forceScale*force[index+PADDED_NUM_ATOMS*2], 0.0);
......
...@@ -779,10 +779,11 @@ inline __device__ real3 loadForce(int index, long long* __restrict__ force) { ...@@ -779,10 +779,11 @@ inline __device__ real3 loadForce(int index, long long* __restrict__ force) {
return make_real3(scale*force[index], scale*force[index+PADDED_NUM_ATOMS], scale*force[index+PADDED_NUM_ATOMS*2]); return make_real3(scale*force[index], scale*force[index+PADDED_NUM_ATOMS], scale*force[index+PADDED_NUM_ATOMS*2]);
} }
inline __device__ void storeForce(int index, long long* __restrict__ force, real3 value) { inline __device__ void addForce(int index, long long* __restrict__ force, real3 value) {
force[index] = (long long) (value.x*0xFFFFFFFF); unsigned long long* f = (unsigned long long*) force;
force[index+PADDED_NUM_ATOMS] = (long long) (value.y*0xFFFFFFFF); atomicAdd(&f[index], static_cast<unsigned long long>((long long) (value.x*0xFFFFFFFF)));
force[index+2*PADDED_NUM_ATOMS] = (long long) (value.z*0xFFFFFFFF); atomicAdd(&f[index+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (value.y*0xFFFFFFFF)));
atomicAdd(&f[index+PADDED_NUM_ATOMS*2], static_cast<unsigned long long>((long long) (value.z*0xFFFFFFFF)));
} }
/** /**
...@@ -799,12 +800,8 @@ extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__ ...@@ -799,12 +800,8 @@ extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__
int4 atoms = avg2Atoms[index]; int4 atoms = avg2Atoms[index];
real2 weights = avg2Weights[index]; real2 weights = avg2Weights[index];
real3 f = loadForce(atoms.x, force); real3 f = loadForce(atoms.x, force);
real3 f1 = loadForce(atoms.y, force); addForce(atoms.y, force, f*weights.x);
real3 f2 = loadForce(atoms.z, force); addForce(atoms.z, force, f*weights.y);
f1 += f*weights.x;
f2 += f*weights.y;
storeForce(atoms.y, force, f1);
storeForce(atoms.z, force, f2);
} }
// Three particle average sites. // Three particle average sites.
...@@ -813,15 +810,9 @@ extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__ ...@@ -813,15 +810,9 @@ extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__
int4 atoms = avg3Atoms[index]; int4 atoms = avg3Atoms[index];
real4 weights = avg3Weights[index]; real4 weights = avg3Weights[index];
real3 f = loadForce(atoms.x, force); real3 f = loadForce(atoms.x, force);
real3 f1 = loadForce(atoms.y, force); addForce(atoms.y, force, f*weights.x);
real3 f2 = loadForce(atoms.z, force); addForce(atoms.z, force, f*weights.y);
real3 f3 = loadForce(atoms.w, force); addForce(atoms.w, force, f*weights.z);
f1 += f*weights.x;
f2 += f*weights.y;
f3 += f*weights.z;
storeForce(atoms.y, force, f1);
storeForce(atoms.z, force, f2);
storeForce(atoms.w, force, f3);
} }
// Out of plane sites. // Out of plane sites.
...@@ -835,20 +826,14 @@ extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__ ...@@ -835,20 +826,14 @@ extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__
mixed4 v12 = pos2-pos1; mixed4 v12 = pos2-pos1;
mixed4 v13 = pos3-pos1; mixed4 v13 = pos3-pos1;
real3 f = loadForce(atoms.x, force); real3 f = loadForce(atoms.x, force);
real3 f1 = loadForce(atoms.y, force);
real3 f2 = loadForce(atoms.z, force);
real3 f3 = loadForce(atoms.w, force);
real3 fp2 = make_real3((real) (weights.x*f.x - weights.z*v13.z*f.y + weights.z*v13.y*f.z), real3 fp2 = make_real3((real) (weights.x*f.x - weights.z*v13.z*f.y + weights.z*v13.y*f.z),
(real) (weights.z*v13.z*f.x + weights.x*f.y - weights.z*v13.x*f.z), (real) (weights.z*v13.z*f.x + weights.x*f.y - weights.z*v13.x*f.z),
(real) (-weights.z*v13.y*f.x + weights.z*v13.x*f.y + weights.x*f.z)); (real) (-weights.z*v13.y*f.x + weights.z*v13.x*f.y + weights.x*f.z));
real3 fp3 = make_real3((real) (weights.y*f.x + weights.z*v12.z*f.y - weights.z*v12.y*f.z), real3 fp3 = make_real3((real) (weights.y*f.x + weights.z*v12.z*f.y - weights.z*v12.y*f.z),
(real) (-weights.z*v12.z*f.x + weights.y*f.y + weights.z*v12.x*f.z), (real) (-weights.z*v12.z*f.x + weights.y*f.y + weights.z*v12.x*f.z),
(real) (weights.z*v12.y*f.x - weights.z*v12.x*f.y + weights.y*f.z)); (real) (weights.z*v12.y*f.x - weights.z*v12.x*f.y + weights.y*f.z));
f1 += f-fp2-fp3; addForce(atoms.y, force, f-fp2-fp3);
f2 += fp2; addForce(atoms.z, force, fp2);
f3 += fp3; addForce(atoms.w, force, fp3);
storeForce(atoms.y, force, f1);
storeForce(atoms.z, force, f2);
storeForce(atoms.w, force, f3);
} }
} }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment