Commit 18fb6efc authored by Peter Eastman's avatar Peter Eastman
Browse files

Implemented mixed precision mode

parent 76716f1c
...@@ -1977,7 +1977,20 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor ...@@ -1977,7 +1977,20 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms()); defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks()); defines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks());
defines["FORCE_WORK_GROUP_SIZE"] = cu.intToString(nb.getForceThreadBlockSize()); defines["FORCE_WORK_GROUP_SIZE"] = cu.intToString(nb.getForceThreadBlockSize());
CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::gbsaObc1, defines); map<string, string> replacements;
stringstream defineAccum;
if (cu.getAccumulateInDouble()) {
defineAccum << "typedef double accum;\n";
defineAccum << "typedef double4 accum4;\n";
defines["make_accum4"] = "make_double4";
}
else {
defineAccum << "typedef real accum;\n";
defineAccum << "typedef real4 accum4;\n";
defines["make_accum4"] = "make_real4";
}
replacements["DEFINE_ACCUM"] = defineAccum.str();
CUmodule module = cu.createModule(CudaKernelSources::vectorOps+cu.replaceStrings(CudaKernelSources::gbsaObc1, replacements), defines);
computeBornSumKernel = cu.getKernel(module, "computeBornSum"); computeBornSumKernel = cu.getKernel(module, "computeBornSum");
computeSumArgs.push_back(&bornSum->getDevicePointer()); computeSumArgs.push_back(&bornSum->getDevicePointer());
computeSumArgs.push_back(&cu.getPosq().getDevicePointer()); computeSumArgs.push_back(&cu.getPosq().getDevicePointer());
...@@ -2422,7 +2435,7 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG ...@@ -2422,7 +2435,7 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
map<string, string> replacements; map<string, string> replacements;
string n2EnergyStr = n2EnergySource.str(); string n2EnergyStr = n2EnergySource.str();
replacements["COMPUTE_INTERACTION"] = n2EnergyStr; replacements["COMPUTE_INTERACTION"] = n2EnergyStr;
stringstream extraArgs, atomParams, loadLocal1, loadLocal2, clearLocal, load1, load2, declare1, recordDeriv, storeDerivs1, storeDerivs2, declareTemps, setTemps; stringstream extraArgs, atomParams, loadLocal1, loadLocal2, clearLocal, load1, load2, declare1, recordDeriv, storeDerivs1, storeDerivs2;
if (force.getNumGlobalParameters() > 0) if (force.getNumGlobalParameters() > 0)
extraArgs << ", const float* globals"; extraArgs << ", const float* globals";
pairEnergyUsesParam.resize(params->getBuffers().size(), false); pairEnergyUsesParam.resize(params->getBuffers().size(), false);
...@@ -2459,15 +2472,13 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG ...@@ -2459,15 +2472,13 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
extraArgs << ", unsigned long long* __restrict__ derivBuffers"; extraArgs << ", unsigned long long* __restrict__ derivBuffers";
for (int i = 0; i < force.getNumComputedValues(); i++) { for (int i = 0; i < force.getNumComputedValues(); i++) {
string index = cu.intToString(i+1); string index = cu.intToString(i+1);
atomParams << "real deriv" << index << ";\n"; atomParams << "accum deriv" << index << ";\n";
clearLocal << "localData[localAtomIndex].deriv" << index << " = 0;\n"; clearLocal << "localData[localAtomIndex].deriv" << index << " = 0;\n";
declare1 << "real deriv" << index << "_1 = 0;\n"; declare1 << "accum deriv" << index << "_1 = 0;\n";
load2 << "real deriv" << index << "_2 = 0;\n"; load2 << "real deriv" << index << "_2 = 0;\n";
recordDeriv << "localData[atom2].deriv" << index << " += deriv" << index << "_2;\n"; recordDeriv << "localData[atom2].deriv" << index << " += deriv" << index << "_2;\n";
storeDerivs1 << "STORE_DERIVATIVE_1(" << index << ")\n"; storeDerivs1 << "STORE_DERIVATIVE_1(" << index << ")\n";
storeDerivs2 << "STORE_DERIVATIVE_2(" << index << ")\n"; storeDerivs2 << "STORE_DERIVATIVE_2(" << index << ")\n";
declareTemps << "__local real tempDerivBuffer" << index << "[64];\n";
setTemps << "tempDerivBuffer" << index << "[threadIdx.x] = deriv" << index << "_1;\n";
atomParamSize++; atomParamSize++;
} }
replacements["PARAMETER_ARGUMENTS"] = extraArgs.str()+tableArgs.str(); replacements["PARAMETER_ARGUMENTS"] = extraArgs.str()+tableArgs.str();
...@@ -2481,9 +2492,19 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG ...@@ -2481,9 +2492,19 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
replacements["RECORD_DERIVATIVE_2"] = recordDeriv.str(); replacements["RECORD_DERIVATIVE_2"] = recordDeriv.str();
replacements["STORE_DERIVATIVES_1"] = storeDerivs1.str(); replacements["STORE_DERIVATIVES_1"] = storeDerivs1.str();
replacements["STORE_DERIVATIVES_2"] = storeDerivs2.str(); replacements["STORE_DERIVATIVES_2"] = storeDerivs2.str();
replacements["DECLARE_TEMP_BUFFERS"] = declareTemps.str();
replacements["SET_TEMP_BUFFERS"] = setTemps.str();
map<string, string> defines; map<string, string> defines;
stringstream defineAccum;
if (cu.getAccumulateInDouble()) {
defineAccum << "typedef double accum;\n";
defineAccum << "typedef double3 accum3;\n";
defines["make_accum3"] = "make_double3";
}
else {
defineAccum << "typedef real accum;\n";
defineAccum << "typedef real3 accum3;\n";
defines["make_accum3"] = "make_real3";
}
replacements["DEFINE_ACCUM"] = defineAccum.str();
if (useCutoff) if (useCutoff)
defines["USE_CUTOFF"] = "1"; defines["USE_CUTOFF"] = "1";
if (usePeriodic) if (usePeriodic)
......
...@@ -445,10 +445,22 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, ...@@ -445,10 +445,22 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms()); defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms()); defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks()); defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
if ((localDataSize/4)%2 == 0 && !context.getUseDoublePrecision()) if ((localDataSize/4)%2 == 0 && !context.getUseDoublePrecision() && !context.getAccumulateInDouble())
defines["PARAMETER_SIZE_IS_EVEN"] = "1"; defines["PARAMETER_SIZE_IS_EVEN"] = "1";
if (context.getComputeCapability() >= 3.0 && !context.getUseDoublePrecision()) if (context.getComputeCapability() >= 3.0 && !context.getUseDoublePrecision())
defines["ENABLE_SHUFFLE"] = "1"; defines["ENABLE_SHUFFLE"] = "1";
stringstream defineAccum;
if (context.getAccumulateInDouble()) {
defineAccum << "typedef double accum;\n";
defineAccum << "typedef double3 accum3;\n";
defines["make_accum3"] = "make_double3";
}
else {
defineAccum << "typedef real accum;\n";
defineAccum << "typedef real3 accum3;\n";
defines["make_accum3"] = "make_real3";
}
replacements["DEFINE_ACCUM"] = defineAccum.str();
CUmodule program = context.createModule(CudaKernelSources::vectorOps+context.replaceStrings(CudaKernelSources::nonbonded, replacements), defines); CUmodule program = context.createModule(CudaKernelSources::vectorOps+context.replaceStrings(CudaKernelSources::nonbonded, replacements), defines);
CUfunction kernel = context.getKernel(program, "computeNonbonded"); CUfunction kernel = context.getKernel(program, "computeNonbonded");
......
...@@ -88,12 +88,12 @@ CudaPlatform::CudaPlatform() { ...@@ -88,12 +88,12 @@ CudaPlatform::CudaPlatform() {
#ifdef _MSC_VER #ifdef _MSC_VER
char* bindir = getenv("CUDA_BIN_PATH"); char* bindir = getenv("CUDA_BIN_PATH");
string nvcc = (bindir == NULL ? "nvcc.exe" : string(bindir)+"\\nvcc.exe"); string nvcc = (bindir == NULL ? "nvcc.exe" : string(bindir)+"\\nvcc.exe");
int length = GetShortPathName(nvcc.c_str(), NULL, 0); int length = GetShortPathName(nvcc.c_str(), NULL, 0);
if (length > 0) { if (length > 0) {
vector<char> shortName(length); vector<char> shortName(length);
GetShortPathName(nvcc.c_str(), &shortName[0], length); GetShortPathName(nvcc.c_str(), &shortName[0], length);
nvcc = string(&shortName[0]); nvcc = string(&shortName[0]);
} }
setPropertyDefaultValue(CudaCompiler(), nvcc); setPropertyDefaultValue(CudaCompiler(), nvcc);
setPropertyDefaultValue(CudaTempDirectory(), string(getenv("TEMP"))); setPropertyDefaultValue(CudaTempDirectory(), string(getenv("TEMP")));
#else #else
......
...@@ -2,10 +2,11 @@ ...@@ -2,10 +2,11 @@
#define STORE_DERIVATIVE_2(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].deriv##INDEX*0xFFFFFFFF))); #define STORE_DERIVATIVE_2(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].deriv##INDEX*0xFFFFFFFF)));
#define TILE_SIZE 32 #define TILE_SIZE 32
DEFINE_ACCUM
typedef struct { typedef struct {
real4 posq; real4 posq;
real3 force; accum3 force;
ATOM_PARAMETER_DATA ATOM_PARAMETER_DATA
#ifdef NEED_PADDING #ifdef NEED_PADDING
float padding; float padding;
...@@ -46,7 +47,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc ...@@ -46,7 +47,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
const unsigned int tbx = threadIdx.x - tgx; const unsigned int tbx = threadIdx.x - tgx;
const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE; const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
unsigned int x, y; unsigned int x, y;
real3 force = make_real3(0); accum3 force = make_accum3(0);
DECLARE_ATOM1_DERIVATIVES DECLARE_ATOM1_DERIVATIVES
if (pos < end) { if (pos < end) {
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
...@@ -122,7 +123,9 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc ...@@ -122,7 +123,9 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
} }
energy += 0.5f*tempEnergy; energy += 0.5f*tempEnergy;
delta *= dEdR; delta *= dEdR;
force -= delta; force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
} }
#endif #endif
...@@ -140,7 +143,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc ...@@ -140,7 +143,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
localData[localAtomIndex].posq = posq[j]; localData[localAtomIndex].posq = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
} }
localData[localAtomIndex].force = make_real3(0); localData[localAtomIndex].force = make_accum3(0);
CLEAR_LOCAL_DERIVATIVES CLEAR_LOCAL_DERIVATIVES
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF); unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
...@@ -185,9 +188,13 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc ...@@ -185,9 +188,13 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
} }
energy += tempEnergy; energy += tempEnergy;
delta *= dEdR; delta *= dEdR;
force -= delta; force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
atom2 = tbx+tj; atom2 = tbx+tj;
localData[atom2].force += delta; localData[atom2].force.x += delta.x;
localData[atom2].force.y += delta.y;
localData[atom2].force.z += delta.z;
RECORD_DERIVATIVE_2 RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
} }
......
...@@ -4,6 +4,8 @@ ...@@ -4,6 +4,8 @@
#define TILE_SIZE 32 #define TILE_SIZE 32
#define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE) #define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)
DEFINE_ACCUM
/** /**
* Reduce the Born sums to compute the Born radii. * Reduce the Born sums to compute the Born radii.
*/ */
...@@ -331,7 +333,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa ...@@ -331,7 +333,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
typedef struct { typedef struct {
real x, y, z; real x, y, z;
real q; real q;
real fx, fy, fz, fw; accum fx, fy, fz, fw;
real bornRadius; real bornRadius;
} AtomData2; } AtomData2;
...@@ -372,7 +374,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo ...@@ -372,7 +374,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
const unsigned int tbx = threadIdx.x - tgx; const unsigned int tbx = threadIdx.x - tgx;
const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE; const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
unsigned int x, y; unsigned int x, y;
real4 force = make_real4(0); accum4 force = make_accum4(0);
if (pos < end) { if (pos < end) {
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (numTiles <= maxTiles) { if (numTiles <= maxTiles) {
......
#define TILE_SIZE 32 #define TILE_SIZE 32
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE) #define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
DEFINE_ACCUM
typedef struct { typedef struct {
real x, y, z; real x, y, z;
real q; real q;
real fx, fy, fz; accum fx, fy, fz;
ATOM_PARAMETER_DATA ATOM_PARAMETER_DATA
#ifndef PARAMETER_SIZE_IS_EVEN #ifndef PARAMETER_SIZE_IS_EVEN
real padding; real padding;
...@@ -47,7 +49,7 @@ extern "C" __global__ void computeNonbonded( ...@@ -47,7 +49,7 @@ extern "C" __global__ void computeNonbonded(
const unsigned int tbx = threadIdx.x - tgx; const unsigned int tbx = threadIdx.x - tgx;
const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE; const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
unsigned int x, y; unsigned int x, y;
real3 force = make_real3(0); accum3 force = make_accum3(0);
if (pos < end) { if (pos < end) {
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (numTiles <= maxTiles) { if (numTiles <= maxTiles) {
...@@ -124,9 +126,13 @@ extern "C" __global__ void computeNonbonded( ...@@ -124,9 +126,13 @@ extern "C" __global__ void computeNonbonded(
COMPUTE_INTERACTION COMPUTE_INTERACTION
energy += 0.5f*tempEnergy; energy += 0.5f*tempEnergy;
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
force -= delta*dEdR; force.x -= delta.x*dEdR;
force.y -= delta.y*dEdR;
force.z -= delta.z*dEdR;
#else #else
force -= dEdR1; force.x -= dEdR1.x;
force.y -= dEdR1.y;
force.z -= dEdR1.z;
#endif #endif
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
excl >>= 1; excl >>= 1;
...@@ -191,7 +197,9 @@ extern "C" __global__ void computeNonbonded( ...@@ -191,7 +197,9 @@ extern "C" __global__ void computeNonbonded(
#ifdef ENABLE_SHUFFLE #ifdef ENABLE_SHUFFLE
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
delta *= dEdR; delta *= dEdR;
force -= delta; force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
for (int i = 16; i >= 1; i /= 2) { for (int i = 16; i >= 1; i /= 2) {
delta.x += __shfl_xor(delta.x, i, 32); delta.x += __shfl_xor(delta.x, i, 32);
delta.y += __shfl_xor(delta.y, i, 32); delta.y += __shfl_xor(delta.y, i, 32);
...@@ -203,7 +211,9 @@ extern "C" __global__ void computeNonbonded( ...@@ -203,7 +211,9 @@ extern "C" __global__ void computeNonbonded(
localData[tbx+j].fz += delta.z; localData[tbx+j].fz += delta.z;
} }
#else #else
force -= dEdR1; force.x -= dEdR1.x;
force.y -= dEdR1.y;
force.z -= dEdR1.z;
for (int i = 16; i >= 1; i /= 2) { for (int i = 16; i >= 1; i /= 2) {
dEdR2.x += __shfl_xor(dEdR2.x, i, 32); dEdR2.x += __shfl_xor(dEdR2.x, i, 32);
dEdR2.y += __shfl_xor(dEdR2.y, i, 32); dEdR2.y += __shfl_xor(dEdR2.y, i, 32);
...@@ -218,12 +228,16 @@ extern "C" __global__ void computeNonbonded( ...@@ -218,12 +228,16 @@ extern "C" __global__ void computeNonbonded(
#else #else
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
delta *= dEdR; delta *= dEdR;
force -= delta; force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
tempBuffer[bufferIndex] = delta.x; tempBuffer[bufferIndex] = delta.x;
tempBuffer[bufferIndex+1] = delta.y; tempBuffer[bufferIndex+1] = delta.y;
tempBuffer[bufferIndex+2] = delta.z; tempBuffer[bufferIndex+2] = delta.z;
#else #else
force -= dEdR1; force.x -= dEdR1.x;
force.y -= dEdR1.y;
force.z -= dEdR1.z;
tempBuffer[bufferIndex] = dEdR2.x; tempBuffer[bufferIndex] = dEdR2.x;
tempBuffer[bufferIndex+1] = dEdR2.y; tempBuffer[bufferIndex+1] = dEdR2.y;
tempBuffer[bufferIndex+2] = dEdR2.z; tempBuffer[bufferIndex+2] = dEdR2.z;
...@@ -287,12 +301,16 @@ extern "C" __global__ void computeNonbonded( ...@@ -287,12 +301,16 @@ extern "C" __global__ void computeNonbonded(
energy += tempEnergy; energy += tempEnergy;
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
delta *= dEdR; delta *= dEdR;
force -= delta; force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
localData[tbx+tj].fx += delta.x; localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y; localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z; localData[tbx+tj].fz += delta.z;
#else #else
force -= dEdR1; force.x -= dEdR1.x;
force.y -= dEdR1.y;
force.z -= dEdR1.z;
localData[tbx+tj].fx += dEdR2.x; localData[tbx+tj].fx += dEdR2.x;
localData[tbx+tj].fy += dEdR2.y; localData[tbx+tj].fy += dEdR2.y;
localData[tbx+tj].fz += dEdR2.z; localData[tbx+tj].fz += dEdR2.z;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment