Commit 50eb44c5 authored by peastman's avatar peastman
Browse files

Eliminated some #defines to improve the efficiency of PTX caching

parent 00f51540
...@@ -388,7 +388,6 @@ void CudaApplyConstraintsKernel::apply(ContextImpl& context, double tol) { ...@@ -388,7 +388,6 @@ void CudaApplyConstraintsKernel::apply(ContextImpl& context, double tol) {
if (!hasInitializedKernel) { if (!hasInitializedKernel) {
hasInitializedKernel = true; hasInitializedKernel = true;
map<string, string> defines; map<string, string> defines;
defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
CUmodule module = cu.createModule(CudaKernelSources::constraints, defines); CUmodule module = cu.createModule(CudaKernelSources::constraints, defines);
applyDeltasKernel = cu.getKernel(module, "applyPositionDeltas"); applyDeltasKernel = cu.getKernel(module, "applyPositionDeltas");
} }
...@@ -396,7 +395,8 @@ void CudaApplyConstraintsKernel::apply(ContextImpl& context, double tol) { ...@@ -396,7 +395,8 @@ void CudaApplyConstraintsKernel::apply(ContextImpl& context, double tol) {
cu.clearBuffer(integration.getPosDelta()); cu.clearBuffer(integration.getPosDelta());
integration.applyConstraints(tol); integration.applyConstraints(tol);
CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0); CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
void* args[] = {&cu.getPosq().getDevicePointer(), &posCorrection, &cu.getIntegrationUtilities().getPosDelta().getDevicePointer()}; int numAtoms = cu.getNumAtoms();
void* args[] = {&numAtoms, &cu.getPosq().getDevicePointer(), &posCorrection, &cu.getIntegrationUtilities().getPosDelta().getDevicePointer()};
cu.executeKernel(applyDeltasKernel, args, cu.getNumAtoms()); cu.executeKernel(applyDeltasKernel, args, cu.getNumAtoms());
integration.computeVirtualSites(); integration.computeVirtualSites();
} }
...@@ -4156,8 +4156,6 @@ void CudaIntegrateVerletStepKernel::initialize(const System& system, const Verle ...@@ -4156,8 +4156,6 @@ void CudaIntegrateVerletStepKernel::initialize(const System& system, const Verle
cu.getPlatformData().initializeContexts(system); cu.getPlatformData().initializeContexts(system);
cu.setAsCurrent(); cu.setAsCurrent();
map<string, string> defines; map<string, string> defines;
defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
CUmodule module = cu.createModule(CudaKernelSources::verlet, defines, ""); CUmodule module = cu.createModule(CudaKernelSources::verlet, defines, "");
kernel1 = cu.getKernel(module, "integrateVerletPart1"); kernel1 = cu.getKernel(module, "integrateVerletPart1");
kernel2 = cu.getKernel(module, "integrateVerletPart2"); kernel2 = cu.getKernel(module, "integrateVerletPart2");
...@@ -4168,6 +4166,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn ...@@ -4168,6 +4166,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
cu.setAsCurrent(); cu.setAsCurrent();
CudaIntegrationUtilities& integration = cu.getIntegrationUtilities(); CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
int numAtoms = cu.getNumAtoms(); int numAtoms = cu.getNumAtoms();
int paddedNumAtoms = cu.getPaddedNumAtoms();
double dt = integrator.getStepSize(); double dt = integrator.getStepSize();
if (dt != prevStepSize) { if (dt != prevStepSize) {
if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) { if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
...@@ -4186,7 +4185,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn ...@@ -4186,7 +4185,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
// Call the first integration kernel. // Call the first integration kernel.
CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0); CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
void* args1[] = {&cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection, void* args1[] = {&numAtoms, &paddedNumAtoms, &cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
&cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer()}; &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
cu.executeKernel(kernel1, args1, numAtoms); cu.executeKernel(kernel1, args1, numAtoms);
...@@ -4196,7 +4195,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn ...@@ -4196,7 +4195,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
// Call the second integration kernel. // Call the second integration kernel.
void* args2[] = {&cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection, void* args2[] = {&numAtoms, &cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
&cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()}; &cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
cu.executeKernel(kernel2, args2, numAtoms); cu.executeKernel(kernel2, args2, numAtoms);
integration.computeVirtualSites(); integration.computeVirtualSites();
...@@ -4223,8 +4222,6 @@ void CudaIntegrateLangevinStepKernel::initialize(const System& system, const Lan ...@@ -4223,8 +4222,6 @@ void CudaIntegrateLangevinStepKernel::initialize(const System& system, const Lan
cu.setAsCurrent(); cu.setAsCurrent();
cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed()); cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
map<string, string> defines; map<string, string> defines;
defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
CUmodule module = cu.createModule(CudaKernelSources::langevin, defines, ""); CUmodule module = cu.createModule(CudaKernelSources::langevin, defines, "");
kernel1 = cu.getKernel(module, "integrateLangevinPart1"); kernel1 = cu.getKernel(module, "integrateLangevinPart1");
kernel2 = cu.getKernel(module, "integrateLangevinPart2"); kernel2 = cu.getKernel(module, "integrateLangevinPart2");
...@@ -4236,6 +4233,7 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev ...@@ -4236,6 +4233,7 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
cu.setAsCurrent(); cu.setAsCurrent();
CudaIntegrationUtilities& integration = cu.getIntegrationUtilities(); CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
int numAtoms = cu.getNumAtoms(); int numAtoms = cu.getNumAtoms();
int paddedNumAtoms = cu.getPaddedNumAtoms();
double temperature = integrator.getTemperature(); double temperature = integrator.getTemperature();
double friction = integrator.getFriction(); double friction = integrator.getFriction();
double stepSize = integrator.getStepSize(); double stepSize = integrator.getStepSize();
...@@ -4273,7 +4271,7 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev ...@@ -4273,7 +4271,7 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
// Call the first integration kernel. // Call the first integration kernel.
int randomIndex = integration.prepareRandomNumbers(cu.getPaddedNumAtoms()); int randomIndex = integration.prepareRandomNumbers(cu.getPaddedNumAtoms());
void* args1[] = {&cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(), void* args1[] = {&numAtoms, &paddedNumAtoms, &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(),
&params->getDevicePointer(), &integration.getStepSize().getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex}; &params->getDevicePointer(), &integration.getStepSize().getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex};
cu.executeKernel(kernel1, args1, numAtoms); cu.executeKernel(kernel1, args1, numAtoms);
...@@ -4284,7 +4282,7 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev ...@@ -4284,7 +4282,7 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
// Call the second integration kernel. // Call the second integration kernel.
CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0); CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
void* args2[] = {&cu.getPosq().getDevicePointer(), &posCorrection, &integration.getPosDelta().getDevicePointer(), void* args2[] = {&numAtoms, &cu.getPosq().getDevicePointer(), &posCorrection, &integration.getPosDelta().getDevicePointer(),
&cu.getVelm().getDevicePointer(), &integration.getStepSize().getDevicePointer()}; &cu.getVelm().getDevicePointer(), &integration.getStepSize().getDevicePointer()};
cu.executeKernel(kernel2, args2, numAtoms); cu.executeKernel(kernel2, args2, numAtoms);
integration.computeVirtualSites(); integration.computeVirtualSites();
...@@ -4308,8 +4306,6 @@ void CudaIntegrateBrownianStepKernel::initialize(const System& system, const Bro ...@@ -4308,8 +4306,6 @@ void CudaIntegrateBrownianStepKernel::initialize(const System& system, const Bro
cu.setAsCurrent(); cu.setAsCurrent();
cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed()); cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
map<string, string> defines; map<string, string> defines;
defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
CUmodule module = cu.createModule(CudaKernelSources::brownian, defines, ""); CUmodule module = cu.createModule(CudaKernelSources::brownian, defines, "");
kernel1 = cu.getKernel(module, "integrateBrownianPart1"); kernel1 = cu.getKernel(module, "integrateBrownianPart1");
kernel2 = cu.getKernel(module, "integrateBrownianPart2"); kernel2 = cu.getKernel(module, "integrateBrownianPart2");
...@@ -4320,6 +4316,7 @@ void CudaIntegrateBrownianStepKernel::execute(ContextImpl& context, const Browni ...@@ -4320,6 +4316,7 @@ void CudaIntegrateBrownianStepKernel::execute(ContextImpl& context, const Browni
cu.setAsCurrent(); cu.setAsCurrent();
CudaIntegrationUtilities& integration = cu.getIntegrationUtilities(); CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
int numAtoms = cu.getNumAtoms(); int numAtoms = cu.getNumAtoms();
int paddedNumAtoms = cu.getPaddedNumAtoms();
double temperature = integrator.getTemperature(); double temperature = integrator.getTemperature();
double friction = integrator.getFriction(); double friction = integrator.getFriction();
double stepSize = integrator.getStepSize(); double stepSize = integrator.getStepSize();
...@@ -4334,7 +4331,7 @@ void CudaIntegrateBrownianStepKernel::execute(ContextImpl& context, const Browni ...@@ -4334,7 +4331,7 @@ void CudaIntegrateBrownianStepKernel::execute(ContextImpl& context, const Browni
// Call the first integration kernel. // Call the first integration kernel.
int randomIndex = integration.prepareRandomNumbers(cu.getPaddedNumAtoms()); int randomIndex = integration.prepareRandomNumbers(cu.getPaddedNumAtoms());
void* args1[] = {useDouble ? (void*) &tauDt : (void*) &tauDtFloat, void* args1[] = {&numAtoms, &paddedNumAtoms, useDouble ? (void*) &tauDt : (void*) &tauDtFloat,
useDouble ? (void*) &noise : (void*) &noiseFloat, useDouble ? (void*) &noise : (void*) &noiseFloat,
&cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(),
&cu.getVelm().getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex}; &cu.getVelm().getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex};
...@@ -4347,7 +4344,7 @@ void CudaIntegrateBrownianStepKernel::execute(ContextImpl& context, const Browni ...@@ -4347,7 +4344,7 @@ void CudaIntegrateBrownianStepKernel::execute(ContextImpl& context, const Browni
// Call the second integration kernel. // Call the second integration kernel.
CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0); CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
void* args2[] = {useDouble ? (void*) &stepSize : (void*) &stepSizeFloat, void* args2[] = {&numAtoms, useDouble ? (void*) &stepSize : (void*) &stepSizeFloat,
&cu.getPosq().getDevicePointer(), &posCorrection, &cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()}; &cu.getPosq().getDevicePointer(), &posCorrection, &cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
cu.executeKernel(kernel2, args2, numAtoms); cu.executeKernel(kernel2, args2, numAtoms);
integration.computeVirtualSites(); integration.computeVirtualSites();
...@@ -4370,8 +4367,6 @@ void CudaIntegrateVariableVerletStepKernel::initialize(const System& system, con ...@@ -4370,8 +4367,6 @@ void CudaIntegrateVariableVerletStepKernel::initialize(const System& system, con
cu.getPlatformData().initializeContexts(system); cu.getPlatformData().initializeContexts(system);
cu.setAsCurrent(); cu.setAsCurrent();
map<string, string> defines; map<string, string> defines;
defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
CUmodule module = cu.createModule(CudaKernelSources::verlet, defines, ""); CUmodule module = cu.createModule(CudaKernelSources::verlet, defines, "");
kernel1 = cu.getKernel(module, "integrateVerletPart1"); kernel1 = cu.getKernel(module, "integrateVerletPart1");
kernel2 = cu.getKernel(module, "integrateVerletPart2"); kernel2 = cu.getKernel(module, "integrateVerletPart2");
...@@ -4383,6 +4378,7 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons ...@@ -4383,6 +4378,7 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons
cu.setAsCurrent(); cu.setAsCurrent();
CudaIntegrationUtilities& integration = cu.getIntegrationUtilities(); CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
int numAtoms = cu.getNumAtoms(); int numAtoms = cu.getNumAtoms();
int paddedNumAtoms = cu.getPaddedNumAtoms();
// Select the step size to use. // Select the step size to use.
...@@ -4391,7 +4387,7 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons ...@@ -4391,7 +4387,7 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons
double tol = integrator.getErrorTolerance(); double tol = integrator.getErrorTolerance();
float tolFloat = (float) tol; float tolFloat = (float) tol;
bool useDouble = cu.getUseDoublePrecision() || cu.getUseMixedPrecision(); bool useDouble = cu.getUseDoublePrecision() || cu.getUseMixedPrecision();
void* argsSelect[] = {useDouble ? (void*) &maxStepSize : (void*) &maxStepSizeFloat, void* argsSelect[] = {&numAtoms, &paddedNumAtoms, useDouble ? (void*) &maxStepSize : (void*) &maxStepSizeFloat,
useDouble ? (void*) &tol : (void*) &tolFloat, useDouble ? (void*) &tol : (void*) &tolFloat,
&cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getIntegrationUtilities().getStepSize().getDevicePointer(),
&cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer()}; &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer()};
...@@ -4401,7 +4397,7 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons ...@@ -4401,7 +4397,7 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons
// Call the first integration kernel. // Call the first integration kernel.
CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0); CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
void* args1[] = {&cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection, void* args1[] = {&numAtoms, &paddedNumAtoms, &cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
&cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer()}; &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
cu.executeKernel(kernel1, args1, numAtoms); cu.executeKernel(kernel1, args1, numAtoms);
...@@ -4411,7 +4407,7 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons ...@@ -4411,7 +4407,7 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons
// Call the second integration kernel. // Call the second integration kernel.
void* args2[] = {&cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection, void* args2[] = {&numAtoms, &cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
&cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()}; &cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
cu.executeKernel(kernel2, args2, numAtoms); cu.executeKernel(kernel2, args2, numAtoms);
integration.computeVirtualSites(); integration.computeVirtualSites();
...@@ -4456,8 +4452,6 @@ void CudaIntegrateVariableLangevinStepKernel::initialize(const System& system, c ...@@ -4456,8 +4452,6 @@ void CudaIntegrateVariableLangevinStepKernel::initialize(const System& system, c
cu.setAsCurrent(); cu.setAsCurrent();
cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed()); cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
map<string, string> defines; map<string, string> defines;
defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
CUmodule module = cu.createModule(CudaKernelSources::langevin, defines, ""); CUmodule module = cu.createModule(CudaKernelSources::langevin, defines, "");
kernel1 = cu.getKernel(module, "integrateLangevinPart1"); kernel1 = cu.getKernel(module, "integrateLangevinPart1");
kernel2 = cu.getKernel(module, "integrateLangevinPart2"); kernel2 = cu.getKernel(module, "integrateLangevinPart2");
...@@ -4471,6 +4465,7 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co ...@@ -4471,6 +4465,7 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co
cu.setAsCurrent(); cu.setAsCurrent();
CudaIntegrationUtilities& integration = cu.getIntegrationUtilities(); CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
int numAtoms = cu.getNumAtoms(); int numAtoms = cu.getNumAtoms();
int paddedNumAtoms = cu.getPaddedNumAtoms();
// Select the step size to use. // Select the step size to use.
...@@ -4483,7 +4478,7 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co ...@@ -4483,7 +4478,7 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co
double kT = BOLTZ*integrator.getTemperature(); double kT = BOLTZ*integrator.getTemperature();
float kTFloat = (float) kT; float kTFloat = (float) kT;
bool useDouble = cu.getUseDoublePrecision() || cu.getUseMixedPrecision(); bool useDouble = cu.getUseDoublePrecision() || cu.getUseMixedPrecision();
void* argsSelect[] = {useDouble ? (void*) &maxStepSize : (void*) &maxStepSizeFloat, void* argsSelect[] = {&numAtoms, &paddedNumAtoms, useDouble ? (void*) &maxStepSize : (void*) &maxStepSizeFloat,
useDouble ? (void*) &tol : (void*) &tolFloat, useDouble ? (void*) &tol : (void*) &tolFloat,
useDouble ? (void*) &tau : (void*) &tauFloat, useDouble ? (void*) &tau : (void*) &tauFloat,
useDouble ? (void*) &kT : (void*) &kTFloat, useDouble ? (void*) &kT : (void*) &kTFloat,
...@@ -4495,7 +4490,7 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co ...@@ -4495,7 +4490,7 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co
// Call the first integration kernel. // Call the first integration kernel.
int randomIndex = integration.prepareRandomNumbers(cu.getPaddedNumAtoms()); int randomIndex = integration.prepareRandomNumbers(cu.getPaddedNumAtoms());
void* args1[] = {&cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(), void* args1[] = {&numAtoms, &paddedNumAtoms, &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(),
&params->getDevicePointer(), &integration.getStepSize().getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex}; &params->getDevicePointer(), &integration.getStepSize().getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex};
cu.executeKernel(kernel1, args1, numAtoms); cu.executeKernel(kernel1, args1, numAtoms);
...@@ -4506,7 +4501,7 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co ...@@ -4506,7 +4501,7 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co
// Call the second integration kernel. // Call the second integration kernel.
CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0); CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
void* args2[] = {&cu.getPosq().getDevicePointer(), &posCorrection, &integration.getPosDelta().getDevicePointer(), void* args2[] = {&numAtoms, &cu.getPosq().getDevicePointer(), &posCorrection, &integration.getPosDelta().getDevicePointer(),
&cu.getVelm().getDevicePointer(), &integration.getStepSize().getDevicePointer()}; &cu.getVelm().getDevicePointer(), &integration.getStepSize().getDevicePointer()};
cu.executeKernel(kernel2, args2, numAtoms); cu.executeKernel(kernel2, args2, numAtoms);
integration.computeVirtualSites(); integration.computeVirtualSites();
...@@ -5369,7 +5364,6 @@ void CudaApplyAndersenThermostatKernel::initialize(const System& system, const A ...@@ -5369,7 +5364,6 @@ void CudaApplyAndersenThermostatKernel::initialize(const System& system, const A
cu.setAsCurrent(); cu.setAsCurrent();
randomSeed = thermostat.getRandomNumberSeed(); randomSeed = thermostat.getRandomNumberSeed();
map<string, string> defines; map<string, string> defines;
defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
CUmodule module = cu.createModule(CudaKernelSources::andersenThermostat, defines); CUmodule module = cu.createModule(CudaKernelSources::andersenThermostat, defines);
kernel = cu.getKernel(module, "applyAndersenThermostat"); kernel = cu.getKernel(module, "applyAndersenThermostat");
cu.getIntegrationUtilities().initRandomNumberGenerator(randomSeed); cu.getIntegrationUtilities().initRandomNumberGenerator(randomSeed);
...@@ -5391,7 +5385,8 @@ void CudaApplyAndersenThermostatKernel::execute(ContextImpl& context) { ...@@ -5391,7 +5385,8 @@ void CudaApplyAndersenThermostatKernel::execute(ContextImpl& context) {
float frequency = (float) context.getParameter(AndersenThermostat::CollisionFrequency()); float frequency = (float) context.getParameter(AndersenThermostat::CollisionFrequency());
float kT = (float) (BOLTZ*context.getParameter(AndersenThermostat::Temperature())); float kT = (float) (BOLTZ*context.getParameter(AndersenThermostat::Temperature()));
int randomIndex = cu.getIntegrationUtilities().prepareRandomNumbers(cu.getPaddedNumAtoms()); int randomIndex = cu.getIntegrationUtilities().prepareRandomNumbers(cu.getPaddedNumAtoms());
void* args[] = {&frequency, &kT, &cu.getVelm().getDevicePointer(), &cu.getIntegrationUtilities().getStepSize().getDevicePointer(), int numAtoms = cu.getNumAtoms();
void* args[] = {&numAtoms, &frequency, &kT, &cu.getVelm().getDevicePointer(), &cu.getIntegrationUtilities().getStepSize().getDevicePointer(),
&cu.getIntegrationUtilities().getRandom().getDevicePointer(), &randomIndex, &atomGroups->getDevicePointer()}; &cu.getIntegrationUtilities().getRandom().getDevicePointer(), &randomIndex, &atomGroups->getDevicePointer()};
cu.executeKernel(kernel, args, cu.getNumAtoms()); cu.executeKernel(kernel, args, cu.getNumAtoms());
} }
......
...@@ -2,11 +2,11 @@ ...@@ -2,11 +2,11 @@
* Apply the Andersen thermostat to adjust particle velocities. * Apply the Andersen thermostat to adjust particle velocities.
*/ */
extern "C" __global__ void applyAndersenThermostat(float collisionFrequency, float kT, mixed4* velm, const mixed4* __restrict__ stepSize, const float4* __restrict__ random, extern "C" __global__ void applyAndersenThermostat(int numAtoms, float collisionFrequency, float kT, mixed4* velm, const mixed4* __restrict__ stepSize, const float4* __restrict__ random,
unsigned int randomIndex, const int* __restrict__ atomGroups) { unsigned int randomIndex, const int* __restrict__ atomGroups) {
float collisionProbability = 1.0f-expf(-(float) (collisionFrequency*stepSize[0].y)); float collisionProbability = 1.0f-expf(-(float) (collisionFrequency*stepSize[0].y));
float randomRange = erff(collisionProbability/sqrtf(2.0f)); float randomRange = erff(collisionProbability/sqrtf(2.0f));
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) { for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
mixed4 velocity = velm[index]; mixed4 velocity = velm[index];
float4 selectRand = random[randomIndex+atomGroups[index]]; float4 selectRand = random[randomIndex+atomGroups[index]];
float4 velRand = random[randomIndex+index]; float4 velRand = random[randomIndex+index];
......
...@@ -2,16 +2,16 @@ ...@@ -2,16 +2,16 @@
* Perform the first step of Brownian integration. * Perform the first step of Brownian integration.
*/ */
extern "C" __global__ void integrateBrownianPart1(mixed tauDeltaT, mixed noiseAmplitude, const long long* __restrict__ force, extern "C" __global__ void integrateBrownianPart1(int numAtoms, int paddedNumAtoms, mixed tauDeltaT, mixed noiseAmplitude, const long long* __restrict__ force,
mixed4* __restrict__ posDelta, const mixed4* __restrict__ velm, const float4* __restrict__ random, unsigned int randomIndex) { mixed4* __restrict__ posDelta, const mixed4* __restrict__ velm, const float4* __restrict__ random, unsigned int randomIndex) {
randomIndex += blockIdx.x*blockDim.x+threadIdx.x; randomIndex += blockIdx.x*blockDim.x+threadIdx.x;
const mixed fscale = tauDeltaT/(mixed) 0x100000000; const mixed fscale = tauDeltaT/(mixed) 0x100000000;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) { for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
mixed invMass = velm[index].w; mixed invMass = velm[index].w;
if (invMass != 0) { if (invMass != 0) {
posDelta[index].x = fscale*invMass*force[index] + noiseAmplitude*SQRT(invMass)*random[randomIndex].x; posDelta[index].x = fscale*invMass*force[index] + noiseAmplitude*SQRT(invMass)*random[randomIndex].x;
posDelta[index].y = fscale*invMass*force[index+PADDED_NUM_ATOMS] + noiseAmplitude*SQRT(invMass)*random[randomIndex].y; posDelta[index].y = fscale*invMass*force[index+paddedNumAtoms] + noiseAmplitude*SQRT(invMass)*random[randomIndex].y;
posDelta[index].z = fscale*invMass*force[index+PADDED_NUM_ATOMS*2] + noiseAmplitude*SQRT(invMass)*random[randomIndex].z; posDelta[index].z = fscale*invMass*force[index+paddedNumAtoms*2] + noiseAmplitude*SQRT(invMass)*random[randomIndex].z;
} }
randomIndex += blockDim.x*gridDim.x; randomIndex += blockDim.x*gridDim.x;
} }
...@@ -21,9 +21,9 @@ extern "C" __global__ void integrateBrownianPart1(mixed tauDeltaT, mixed noiseAm ...@@ -21,9 +21,9 @@ extern "C" __global__ void integrateBrownianPart1(mixed tauDeltaT, mixed noiseAm
* Perform the second step of Brownian integration. * Perform the second step of Brownian integration.
*/ */
extern "C" __global__ void integrateBrownianPart2(mixed deltaT, real4* posq, real4* __restrict__ posqCorrection, mixed4* velm, const mixed4* __restrict__ posDelta) { extern "C" __global__ void integrateBrownianPart2(int numAtoms, mixed deltaT, real4* posq, real4* __restrict__ posqCorrection, mixed4* velm, const mixed4* __restrict__ posDelta) {
const mixed oneOverDeltaT = RECIP(deltaT); const mixed oneOverDeltaT = RECIP(deltaT);
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) { for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
if (velm[index].w != 0) { if (velm[index].w != 0) {
mixed4 delta = posDelta[index]; mixed4 delta = posDelta[index];
velm[index].x = oneOverDeltaT*delta.x; velm[index].x = oneOverDeltaT*delta.x;
......
extern "C" __global__ void applyPositionDeltas(real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ posDelta) { extern "C" __global__ void applyPositionDeltas(int numAtoms, real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ posDelta) {
for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) { for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
#ifdef USE_MIXED_PRECISION #ifdef USE_MIXED_PRECISION
real4 pos1 = posq[index]; real4 pos1 = posq[index];
real4 pos2 = posqCorrection[index]; real4 pos2 = posqCorrection[index];
......
...@@ -4,7 +4,7 @@ enum {VelScale, ForceScale, NoiseScale, MaxParams}; ...@@ -4,7 +4,7 @@ enum {VelScale, ForceScale, NoiseScale, MaxParams};
* Perform the first step of Langevin integration. * Perform the first step of Langevin integration.
*/ */
extern "C" __global__ void integrateLangevinPart1(mixed4* __restrict__ velm, const long long* __restrict__ force, mixed4* __restrict__ posDelta, extern "C" __global__ void integrateLangevinPart1(int numAtoms, int paddedNumAtoms, mixed4* __restrict__ velm, const long long* __restrict__ force, mixed4* __restrict__ posDelta,
const mixed* __restrict__ paramBuffer, const mixed2* __restrict__ dt, const float4* __restrict__ random, unsigned int randomIndex) { const mixed* __restrict__ paramBuffer, const mixed2* __restrict__ dt, const float4* __restrict__ random, unsigned int randomIndex) {
mixed vscale = paramBuffer[VelScale]; mixed vscale = paramBuffer[VelScale];
mixed fscale = paramBuffer[ForceScale]/(mixed) 0x100000000; mixed fscale = paramBuffer[ForceScale]/(mixed) 0x100000000;
...@@ -12,13 +12,13 @@ extern "C" __global__ void integrateLangevinPart1(mixed4* __restrict__ velm, con ...@@ -12,13 +12,13 @@ extern "C" __global__ void integrateLangevinPart1(mixed4* __restrict__ velm, con
mixed stepSize = dt[0].y; mixed stepSize = dt[0].y;
int index = blockIdx.x*blockDim.x+threadIdx.x; int index = blockIdx.x*blockDim.x+threadIdx.x;
randomIndex += index; randomIndex += index;
while (index < NUM_ATOMS) { while (index < numAtoms) {
mixed4 velocity = velm[index]; mixed4 velocity = velm[index];
if (velocity.w != 0) { if (velocity.w != 0) {
mixed sqrtInvMass = SQRT(velocity.w); mixed sqrtInvMass = SQRT(velocity.w);
velocity.x = vscale*velocity.x + fscale*velocity.w*force[index] + noisescale*sqrtInvMass*random[randomIndex].x; velocity.x = vscale*velocity.x + fscale*velocity.w*force[index] + noisescale*sqrtInvMass*random[randomIndex].x;
velocity.y = vscale*velocity.y + fscale*velocity.w*force[index+PADDED_NUM_ATOMS] + noisescale*sqrtInvMass*random[randomIndex].y; velocity.y = vscale*velocity.y + fscale*velocity.w*force[index+paddedNumAtoms] + noisescale*sqrtInvMass*random[randomIndex].y;
velocity.z = vscale*velocity.z + fscale*velocity.w*force[index+PADDED_NUM_ATOMS*2] + noisescale*sqrtInvMass*random[randomIndex].z; velocity.z = vscale*velocity.z + fscale*velocity.w*force[index+paddedNumAtoms*2] + noisescale*sqrtInvMass*random[randomIndex].z;
velm[index] = velocity; velm[index] = velocity;
posDelta[index] = make_mixed4(stepSize*velocity.x, stepSize*velocity.y, stepSize*velocity.z, 0); posDelta[index] = make_mixed4(stepSize*velocity.x, stepSize*velocity.y, stepSize*velocity.z, 0);
} }
...@@ -31,7 +31,7 @@ extern "C" __global__ void integrateLangevinPart1(mixed4* __restrict__ velm, con ...@@ -31,7 +31,7 @@ extern "C" __global__ void integrateLangevinPart1(mixed4* __restrict__ velm, con
* Perform the second step of Langevin integration. * Perform the second step of Langevin integration.
*/ */
extern "C" __global__ void integrateLangevinPart2(real4* __restrict__ posq, real4* __restrict__ posqCorrection, const mixed4* __restrict__ posDelta, mixed4* __restrict__ velm, const mixed2* __restrict__ dt) { extern "C" __global__ void integrateLangevinPart2(int numAtoms, real4* __restrict__ posq, real4* __restrict__ posqCorrection, const mixed4* __restrict__ posDelta, mixed4* __restrict__ velm, const mixed2* __restrict__ dt) {
#if __CUDA_ARCH__ >= 130 #if __CUDA_ARCH__ >= 130
double invStepSize = 1.0/dt[0].y; double invStepSize = 1.0/dt[0].y;
#else #else
...@@ -39,7 +39,7 @@ extern "C" __global__ void integrateLangevinPart2(real4* __restrict__ posq, real ...@@ -39,7 +39,7 @@ extern "C" __global__ void integrateLangevinPart2(real4* __restrict__ posq, real
float correction = (1.0f-invStepSize*dt[0].y)/dt[0].y; float correction = (1.0f-invStepSize*dt[0].y)/dt[0].y;
#endif #endif
int index = blockIdx.x*blockDim.x+threadIdx.x; int index = blockIdx.x*blockDim.x+threadIdx.x;
while (index < NUM_ATOMS) { while (index < numAtoms) {
mixed4 vel = velm[index]; mixed4 vel = velm[index];
if (vel.w != 0) { if (vel.w != 0) {
#ifdef USE_MIXED_PRECISION #ifdef USE_MIXED_PRECISION
...@@ -78,7 +78,7 @@ extern "C" __global__ void integrateLangevinPart2(real4* __restrict__ posq, real ...@@ -78,7 +78,7 @@ extern "C" __global__ void integrateLangevinPart2(real4* __restrict__ posq, real
* Select the step size to use for the next step. * Select the step size to use for the next step.
*/ */
extern "C" __global__ void selectLangevinStepSize(mixed maxStepSize, mixed errorTol, mixed tau, mixed kT, mixed2* __restrict__ dt, extern "C" __global__ void selectLangevinStepSize(int numAtoms, int paddedNumAtoms, mixed maxStepSize, mixed errorTol, mixed tau, mixed kT, mixed2* __restrict__ dt,
const mixed4* __restrict__ velm, const long long* __restrict__ force, mixed* __restrict__ paramBuffer) { const mixed4* __restrict__ velm, const long long* __restrict__ force, mixed* __restrict__ paramBuffer) {
// Calculate the error. // Calculate the error.
...@@ -87,8 +87,8 @@ extern "C" __global__ void selectLangevinStepSize(mixed maxStepSize, mixed error ...@@ -87,8 +87,8 @@ extern "C" __global__ void selectLangevinStepSize(mixed maxStepSize, mixed error
mixed err = 0; mixed err = 0;
unsigned int index = threadIdx.x; unsigned int index = threadIdx.x;
const mixed scale = RECIP((mixed) 0x100000000); const mixed scale = RECIP((mixed) 0x100000000);
while (index < NUM_ATOMS) { while (index < numAtoms) {
mixed3 f = make_mixed3(scale*force[index], scale*force[index+PADDED_NUM_ATOMS], scale*force[index+PADDED_NUM_ATOMS*2]); mixed3 f = make_mixed3(scale*force[index], scale*force[index+paddedNumAtoms], scale*force[index+paddedNumAtoms*2]);
mixed invMass = velm[index].w; mixed invMass = velm[index].w;
err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass; err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass;
index += blockDim.x*gridDim.x; index += blockDim.x*gridDim.x;
...@@ -106,7 +106,7 @@ extern "C" __global__ void selectLangevinStepSize(mixed maxStepSize, mixed error ...@@ -106,7 +106,7 @@ extern "C" __global__ void selectLangevinStepSize(mixed maxStepSize, mixed error
if (blockIdx.x*blockDim.x+threadIdx.x == 0) { if (blockIdx.x*blockDim.x+threadIdx.x == 0) {
// Select the new step size. // Select the new step size.
mixed totalError = SQRT(error[0]/(NUM_ATOMS*3)); mixed totalError = SQRT(error[0]/(numAtoms*3));
mixed newStepSize = SQRT(errorTol/totalError); mixed newStepSize = SQRT(errorTol/totalError);
mixed oldStepSize = dt[0].y; mixed oldStepSize = dt[0].y;
if (oldStepSize > 0.0f) if (oldStepSize > 0.0f)
......
...@@ -2,13 +2,13 @@ ...@@ -2,13 +2,13 @@
* Perform the first step of Verlet integration. * Perform the first step of Verlet integration.
*/ */
extern "C" __global__ void integrateVerletPart1(const mixed2* __restrict__ dt, const real4* __restrict__ posq, extern "C" __global__ void integrateVerletPart1(int numAtoms, int paddedNumAtoms, const mixed2* __restrict__ dt, const real4* __restrict__ posq,
const real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const long long* __restrict__ force, mixed4* __restrict__ posDelta) { const real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const long long* __restrict__ force, mixed4* __restrict__ posDelta) {
const mixed2 stepSize = dt[0]; const mixed2 stepSize = dt[0];
const mixed dtPos = stepSize.y; const mixed dtPos = stepSize.y;
const mixed dtVel = 0.5f*(stepSize.x+stepSize.y); const mixed dtVel = 0.5f*(stepSize.x+stepSize.y);
const mixed scale = dtVel/(mixed) 0x100000000; const mixed scale = dtVel/(mixed) 0x100000000;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) { for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
mixed4 velocity = velm[index]; mixed4 velocity = velm[index];
if (velocity.w != 0.0) { if (velocity.w != 0.0) {
#ifdef USE_MIXED_PRECISION #ifdef USE_MIXED_PRECISION
...@@ -19,8 +19,8 @@ extern "C" __global__ void integrateVerletPart1(const mixed2* __restrict__ dt, c ...@@ -19,8 +19,8 @@ extern "C" __global__ void integrateVerletPart1(const mixed2* __restrict__ dt, c
real4 pos = posq[index]; real4 pos = posq[index];
#endif #endif
velocity.x += scale*force[index]*velocity.w; velocity.x += scale*force[index]*velocity.w;
velocity.y += scale*force[index+PADDED_NUM_ATOMS]*velocity.w; velocity.y += scale*force[index+paddedNumAtoms]*velocity.w;
velocity.z += scale*force[index+PADDED_NUM_ATOMS*2]*velocity.w; velocity.z += scale*force[index+paddedNumAtoms*2]*velocity.w;
pos.x = velocity.x*dtPos; pos.x = velocity.x*dtPos;
pos.y = velocity.y*dtPos; pos.y = velocity.y*dtPos;
pos.z = velocity.z*dtPos; pos.z = velocity.z*dtPos;
...@@ -34,7 +34,7 @@ extern "C" __global__ void integrateVerletPart1(const mixed2* __restrict__ dt, c ...@@ -34,7 +34,7 @@ extern "C" __global__ void integrateVerletPart1(const mixed2* __restrict__ dt, c
* Perform the second step of Verlet integration. * Perform the second step of Verlet integration.
*/ */
extern "C" __global__ void integrateVerletPart2(mixed2* __restrict__ dt, real4* __restrict__ posq, extern "C" __global__ void integrateVerletPart2(int numAtoms, mixed2* __restrict__ dt, real4* __restrict__ posq,
real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const mixed4* __restrict__ posDelta) { real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const mixed4* __restrict__ posDelta) {
mixed2 stepSize = dt[0]; mixed2 stepSize = dt[0];
#if __CUDA_ARCH__ >= 130 #if __CUDA_ARCH__ >= 130
...@@ -46,7 +46,7 @@ extern "C" __global__ void integrateVerletPart2(mixed2* __restrict__ dt, real4* ...@@ -46,7 +46,7 @@ extern "C" __global__ void integrateVerletPart2(mixed2* __restrict__ dt, real4*
int index = blockIdx.x*blockDim.x+threadIdx.x; int index = blockIdx.x*blockDim.x+threadIdx.x;
if (index == 0) if (index == 0)
dt[0].x = stepSize.y; dt[0].x = stepSize.y;
for (; index < NUM_ATOMS; index += blockDim.x*gridDim.x) { for (; index < numAtoms; index += blockDim.x*gridDim.x) {
mixed4 velocity = velm[index]; mixed4 velocity = velm[index];
if (velocity.w != 0.0) { if (velocity.w != 0.0) {
#ifdef USE_MIXED_PRECISION #ifdef USE_MIXED_PRECISION
...@@ -80,14 +80,14 @@ extern "C" __global__ void integrateVerletPart2(mixed2* __restrict__ dt, real4* ...@@ -80,14 +80,14 @@ extern "C" __global__ void integrateVerletPart2(mixed2* __restrict__ dt, real4*
* Select the step size to use for the next step. * Select the step size to use for the next step.
*/ */
extern "C" __global__ void selectVerletStepSize(mixed maxStepSize, mixed errorTol, mixed2* __restrict__ dt, const mixed4* __restrict__ velm, const long long* __restrict__ force) { extern "C" __global__ void selectVerletStepSize(int numAtoms, int paddedNumAtoms, mixed maxStepSize, mixed errorTol, mixed2* __restrict__ dt, const mixed4* __restrict__ velm, const long long* __restrict__ force) {
// Calculate the error. // Calculate the error.
extern __shared__ mixed error[]; extern __shared__ mixed error[];
mixed err = 0.0f; mixed err = 0.0f;
const mixed scale = RECIP((mixed) 0x100000000); const mixed scale = RECIP((mixed) 0x100000000);
for (int index = threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) { for (int index = threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
mixed3 f = make_mixed3(scale*force[index], scale*force[index+PADDED_NUM_ATOMS], scale*force[index+PADDED_NUM_ATOMS*2]); mixed3 f = make_mixed3(scale*force[index], scale*force[index+paddedNumAtoms], scale*force[index+paddedNumAtoms*2]);
mixed invMass = velm[index].w; mixed invMass = velm[index].w;
err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass; err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass;
} }
...@@ -102,7 +102,7 @@ extern "C" __global__ void selectVerletStepSize(mixed maxStepSize, mixed errorTo ...@@ -102,7 +102,7 @@ extern "C" __global__ void selectVerletStepSize(mixed maxStepSize, mixed errorTo
__syncthreads(); __syncthreads();
} }
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
mixed totalError = SQRT(error[0]/(NUM_ATOMS*3)); mixed totalError = SQRT(error[0]/(numAtoms*3));
mixed newStepSize = SQRT(errorTol/totalError); mixed newStepSize = SQRT(errorTol/totalError);
mixed oldStepSize = dt[0].y; mixed oldStepSize = dt[0].y;
if (oldStepSize > 0.0f) if (oldStepSize > 0.0f)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment