Commit 44665537 authored by Yutong Zhao's avatar Yutong Zhao
Browse files

first working version with shuffle - but breaks lots of existing tests

parent faf5e0fe
...@@ -394,7 +394,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string ...@@ -394,7 +394,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
// Write out the source to a temporary file. // Write out the source to a temporary file.
stringstream tempFileName; stringstream tempFileName;
tempFileName << "openmmTempKernel" << this; // Include a pointer to this context as part of the filename to avoid collisions. tempFileName << "openmmTempKernel" << /*rand() <<*/ this; // Include a pointer to this context as part of the filename to avoid collisions.
string inputFile = (tempDir+tempFileName.str()+".cu"); string inputFile = (tempDir+tempFileName.str()+".cu");
string outputFile = (tempDir+tempFileName.str()+".ptx"); string outputFile = (tempDir+tempFileName.str()+".ptx");
string logFile = (tempDir+tempFileName.str()+".log"); string logFile = (tempDir+tempFileName.str()+".log");
...@@ -428,6 +428,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string ...@@ -428,6 +428,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
} }
log.close(); log.close();
} }
cout << error.str() << endl;
throw OpenMMException(error.str()); throw OpenMMException(error.str());
} }
CUmodule module; CUmodule module;
...@@ -437,15 +438,15 @@ CUmodule CudaContext::createModule(const string source, const map<string, string ...@@ -437,15 +438,15 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
m<<"Error loading CUDA module: "<<getErrorString(result)<<" ("<<result<<")"; m<<"Error loading CUDA module: "<<getErrorString(result)<<" ("<<result<<")";
throw OpenMMException(m.str()); throw OpenMMException(m.str());
} }
remove(inputFile.c_str()); //remove(inputFile.c_str());
remove(outputFile.c_str()); //remove(outputFile.c_str());
remove(logFile.c_str()); //remove(logFile.c_str());
return module; return module;
} }
catch (...) { catch (...) {
remove(inputFile.c_str()); //remove(inputFile.c_str());
remove(outputFile.c_str()); //remove(outputFile.c_str());
remove(logFile.c_str()); //remove(logFile.c_str());
throw; throw;
} }
} }
......
...@@ -445,6 +445,8 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, ...@@ -445,6 +445,8 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
args << arguments[i].getName(); args << arguments[i].getName();
} }
replacements["PARAMETER_ARGUMENTS"] = args.str(); replacements["PARAMETER_ARGUMENTS"] = args.str();
/*
stringstream loadLocal1; stringstream loadLocal1;
for (int i = 0; i < (int) params.size(); i++) { for (int i = 0; i < (int) params.size(); i++) {
if (params[i].getNumComponents() == 1) { if (params[i].getNumComponents() == 1) {
...@@ -456,6 +458,17 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, ...@@ -456,6 +458,17 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
} }
} }
replacements["LOAD_LOCAL_PARAMETERS_FROM_1"] = loadLocal1.str(); replacements["LOAD_LOCAL_PARAMETERS_FROM_1"] = loadLocal1.str();
*/
stringstream loadLocal1;
loadLocal1 << "tempSigmaEpsilon = sigmaEpsilon1;" << endl;
//for (int i = 0; i < (int) params.size(); i++) {
// loadLocal1<<params[i].getType()<<" temp"<<params[i].getName()<<"="<<params[i].getName()<<"1;\n";
//}
//cout << loadLocal1.str() << endl;
replacements["LOAD_LOCAL_PARAMETERS_FROM_1"] = loadLocal1.str();
/*
stringstream loadLocal2; stringstream loadLocal2;
for (int i = 0; i < (int) params.size(); i++) { for (int i = 0; i < (int) params.size(); i++) {
if (params[i].getNumComponents() == 1) { if (params[i].getNumComponents() == 1) {
...@@ -468,6 +481,40 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, ...@@ -468,6 +481,40 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
} }
} }
replacements["LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"] = loadLocal2.str(); replacements["LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"] = loadLocal2.str();
*/
stringstream declareLocal2;
for(int i=0; i< (int) params.size(); i++) {
if (params[i].getNumComponents() == 1) {
// loadLocal2<<params[i].getType()<<" "<<params[i].getName()<<" = global_"<<params[i].getName()<<"[j];\n";
} else {
declareLocal2<<params[i].getType()<<" temp"<<params[i].getName()<<";\n";
}
}
replacements["DECLARE_LOCAL_PARAMETERS"] = declareLocal2.str();
stringstream loadLocal2;
for(int i=0; i< (int) params.size(); i++) {
if (params[i].getNumComponents() == 1) {
// loadLocal2<<params[i].getType()<<" "<<params[i].getName()<<" = global_"<<params[i].getName()<<"[j];\n";
} else {
loadLocal2<<"temp"<<params[i].getName()<<" = global_"<<params[i].getName()<<"[j];\n";
}
}
/*
for (int i = 0; i < (int) params.size(); i++) {
if (params[i].getNumComponents() == 1) {
loadLocal2<<params[i].getType()<<" "<<params[i].getName()<<" = global_"<<params[i].getName()<<"[j];\n";
}
else {
loadLocal2<<params[i].getType()<<" temp_"<<params[i].getName()<<" = global_"<<params[i].getName()<<"[j];\n";
for (int j = 0; j < params[i].getNumComponents(); ++j)
loadLocal2<<params[i].getType()<<" "<<params[i].getName()<<"_"<<suffixes[j]<<" = temp_"<<params[i].getName()<<"."<<suffixes[j]<<";\n";
}
}
*/
replacements["LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"] = loadLocal2.str();
stringstream load1; stringstream load1;
for (int i = 0; i < (int) params.size(); i++) { for (int i = 0; i < (int) params.size(); i++) {
load1 << params[i].getType(); load1 << params[i].getType();
...@@ -478,6 +525,8 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, ...@@ -478,6 +525,8 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
load1 << "[atom1];\n"; load1 << "[atom1];\n";
} }
replacements["LOAD_ATOM1_PARAMETERS"] = load1.str(); replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();
/*
stringstream load2j; stringstream load2j;
for (int i = 0; i < (int) params.size(); i++) { for (int i = 0; i < (int) params.size(); i++) {
if (params[i].getNumComponents() == 1) { if (params[i].getNumComponents() == 1) {
...@@ -494,6 +543,65 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, ...@@ -494,6 +543,65 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
} }
} }
replacements["LOAD_ATOM2_PARAMETERS"] = load2j.str(); replacements["LOAD_ATOM2_PARAMETERS"] = load2j.str();
*/
stringstream load2j;
for (int i = 0; i < (int) params.size(); i++) {
/*
if (params[i].getNumComponents() == 1) {
load2j<<params[i].getType()<<" "<<params[i].getName()<<"2 = "<<params[i].getName()<<";\n";
}
else {
load2j<<params[i].getType()<<" "<<params[i].getName()<<"2 = make_"<<params[i].getType()<<"(";
for (int j = 0; j < params[i].getNumComponents(); ++j) {
if (j > 0)
load2j<<", ";
load2j<<params[i].getName()<<"_"<<suffixes[j];
}
load2j<<");\n";
}*/
load2j<<params[i].getType()<<" "<<params[i].getName()<<"2 = temp"<<params[i].getName()<<";\n";
}
replacements["LOAD_ATOM2_PARAMETERS"] = load2j.str();
stringstream broadcastWarpData;
broadcastWarpData << "posq2.x = __shfl(tempPosq.x, j);\n";
broadcastWarpData << "posq2.y = __shfl(tempPosq.y, j);\n";
broadcastWarpData << "posq2.z = __shfl(tempPosq.z, j);\n";
broadcastWarpData << "posq2.w = __shfl(tempPosq.w, j);\n";
for(int i=0; i< (int) params.size();i++) {
broadcastWarpData << params[i].getType() << " temp" << params[i].getName() << ";\n";
for(int j=0; j < params[i].getNumComponents(); j++) {
string name;
if (params[i].getNumComponents() == 1) {
broadcastWarpData << "temp" << params[i].getName() << "=__shfl(" << params[i].getName() <<"1,j);\n";
} else {
broadcastWarpData << "temp" << params[i].getName()+"."+suffixes[j] << "=__shfl(" << params[i].getName()+"1."+suffixes[j] <<",j);\n";
}
}
}
replacements["BROADCAST_WARP_DATA"] = broadcastWarpData.str();
stringstream shuffleWarpData;
shuffleWarpData << "tempPosq.x = __shfl(tempPosq.x, tgx+1);\n";
shuffleWarpData << "tempPosq.y = __shfl(tempPosq.y, tgx+1);\n";
shuffleWarpData << "tempPosq.z = __shfl(tempPosq.z, tgx+1);\n";
shuffleWarpData << "tempPosq.w = __shfl(tempPosq.w, tgx+1);\n";
shuffleWarpData << "tempForces.x = __shfl(tempForces.x, tgx+1);\n";
shuffleWarpData << "tempForces.y = __shfl(tempForces.y, tgx+1);\n";
shuffleWarpData << "tempForces.z = __shfl(tempForces.z, tgx+1);\n";
shuffleWarpData << "tempsigmaEpsilon.x = __shfl(tempsigmaEpsilon.x, tgx+1);\n";
shuffleWarpData << "tempsigmaEpsilon.y = __shfl(tempsigmaEpsilon.y, tgx+1);\n";
/*
for(int i=0; i< (int) params.size(); i++) {
shuffleWarpData << params[i].getName() << "=__shfl(" << params[i].getName() << ", tgx+1);\n";
}
*/
replacements["SHUFFLE_WARP_DATA"] = shuffleWarpData.str();
map<string, string> defines; map<string, string> defines;
if (useCutoff) if (useCutoff)
defines["USE_CUTOFF"] = "1"; defines["USE_CUTOFF"] = "1";
......
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE) #define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
// structs are aligned to host compiler rules by default.
// large structures can spill into cache if using registers.
// this would defeat the purpose of using shuffles!
typedef struct { typedef struct {
real x, y, z; real x, y, z;
real q; real q;
...@@ -16,7 +19,10 @@ typedef struct { ...@@ -16,7 +19,10 @@ typedef struct {
* implicit warp-level synchronization. A tile is defined by two atom blocks * implicit warp-level synchronization. A tile is defined by two atom blocks
* each of warpsize. Each warp computes a range of tiles. * each of warpsize. Each warp computes a range of tiles.
* *
* Tiles with exclusions compute the entire set of interactions across * On-diagonal tiles processes interaction using a naive all-against-one interaction
* accumulation scheme.
*
* Off-diagonal tiles with exclusions compute the entire set of interactions across
* atom blocks, equal to warpsize*warpsize. In order to avoid access conflicts * atom blocks, equal to warpsize*warpsize. In order to avoid access conflicts
* the forces are computed and accumulated diagonally in the manner shown below * the forces are computed and accumulated diagonally in the manner shown below
* where, suppose * where, suppose
...@@ -43,7 +49,7 @@ typedef struct { ...@@ -43,7 +49,7 @@ typedef struct {
* TODO: Implement shuffle as opposed to using nonbonded. * TODO: Implement shuffle as opposed to using nonbonded.
* *
* Tiles without exclusions read off directly from the neighbourlist interactingAtoms * Tiles without exclusions read off directly from the neighbourlist interactingAtoms
* and follows the same force accumulation method. If more there are more interactingTiles * and follows the same force accumulation method above. If more there are more interactingTiles
* than the size of the neighbourlist initially allocated, the neighbourlist is rebuilt * than the size of the neighbourlist initially allocated, the neighbourlist is rebuilt
* and the full tileset. * and the full tileset.
* *
...@@ -83,9 +89,6 @@ extern "C" __global__ void computeNonbonded( ...@@ -83,9 +89,6 @@ extern "C" __global__ void computeNonbonded(
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp
const unsigned int tbx = threadIdx.x - tgx; // block warpIndex const unsigned int tbx = threadIdx.x - tgx; // block warpIndex
real energy = 0.0f; real energy = 0.0f;
__shared__ AtomData localData[THREAD_BLOCK_SIZE];
// First loop: process tiles that contain exclusions.
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps; const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps; const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
...@@ -96,23 +99,29 @@ extern "C" __global__ void computeNonbonded( ...@@ -96,23 +99,29 @@ extern "C" __global__ void computeNonbonded(
real3 force = make_real3(0); real3 force = make_real3(0);
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
tileflags excl = exclusions[pos*TILE_SIZE+tgx]; tileflags excl = exclusions[pos*TILE_SIZE+tgx];
#endif #endif
const bool hasExclusions = true; const bool hasExclusions = true;
if (x == y) { if (x == y) {
// This tile is on the diagonal. // This tile is on the diagonal.
const unsigned int localAtomIndex = threadIdx.x; const unsigned int localAtomIndex = threadIdx.x;
localData[localAtomIndex].x = posq1.x; real4 tempPosq = posq1;
localData[localAtomIndex].y = posq1.y;
localData[localAtomIndex].z = posq1.z; // we do not need to fetch parameters from global since this is a symmetric tile
localData[localAtomIndex].q = posq1.w; // instead we can broadcast the values using shuffle
LOAD_LOCAL_PARAMETERS_FROM_1 // LOAD_LOCAL_PARAMETERS_FROM_1
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+j; int atom2 = tbx+j;
real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q); real4 posq2;
// load in the data from other registers
BROADCAST_WARP_DATA
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z); real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
...@@ -150,27 +159,25 @@ extern "C" __global__ void computeNonbonded( ...@@ -150,27 +159,25 @@ extern "C" __global__ void computeNonbonded(
#endif #endif
} }
} }
else { else { // This is an off-diagonal tile.
// This is an off-diagonal tile.
const unsigned int localAtomIndex = threadIdx.x; const unsigned int localAtomIndex = threadIdx.x;
unsigned int j = y*TILE_SIZE + tgx; unsigned int j = y*TILE_SIZE + tgx;
real4 tempPosq = posq[j]; real4 tempPosq = posq[j];
localData[localAtomIndex].x = tempPosq.x;
localData[localAtomIndex].y = tempPosq.y; real3 tempForces;
localData[localAtomIndex].z = tempPosq.z; tempForces.x = 0.0f;
localData[localAtomIndex].q = tempPosq.w; tempForces.y = 0.0f;
tempForces.z = 0.0f;
DECLARE_LOCAL_PARAMETERS
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData[localAtomIndex].fx = 0.0f;
localData[localAtomIndex].fy = 0.0f;
localData[localAtomIndex].fz = 0.0f;
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx)); excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif #endif
unsigned int tj = tgx; unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) { for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj; int atom2 = tbx+tj;
real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q); real4 posq2 = tempPosq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z); real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
...@@ -178,65 +185,72 @@ extern "C" __global__ void computeNonbonded( ...@@ -178,65 +185,72 @@ extern "C" __global__ void computeNonbonded(
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) { #ifdef USE_CUTOFF
#endif if (r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2); #endif
real r = RECIP(invR); real invR = RSQRT(r2);
LOAD_ATOM2_PARAMETERS real r = RECIP(invR);
atom2 = y*TILE_SIZE+tj; LOAD_ATOM2_PARAMETERS
#ifdef USE_SYMMETRIC atom2 = y*TILE_SIZE+tj;
real dEdR = 0.0f; #ifdef USE_SYMMETRIC
#else real dEdR = 0.0f;
real3 dEdR1 = make_real3(0); #else
real3 dEdR2 = make_real3(0); real3 dEdR1 = make_real3(0);
#endif real3 dEdR2 = make_real3(0);
#ifdef USE_EXCLUSIONS #endif
bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1)); #ifdef USE_EXCLUSIONS
#endif bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
real tempEnergy = 0.0f; #endif
COMPUTE_INTERACTION real tempEnergy = 0.0f;
energy += tempEnergy; COMPUTE_INTERACTION
#ifdef USE_SYMMETRIC energy += tempEnergy;
delta *= dEdR; #ifdef USE_SYMMETRIC
force.x -= delta.x; delta *= dEdR;
force.y -= delta.y; force.x -= delta.x;
force.z -= delta.z; force.y -= delta.y;
localData[tbx+tj].fx += delta.x; force.z -= delta.z;
localData[tbx+tj].fy += delta.y; tempForces.x += delta.x;
localData[tbx+tj].fz += delta.z; tempForces.y += delta.y;
#else tempForces.z += delta.z;
force.x -= dEdR1.x; #else
force.y -= dEdR1.y; force.x -= dEdR1.x;
force.z -= dEdR1.z; force.y -= dEdR1.y;
localData[tbx+tj].fx += dEdR2.x; force.z -= dEdR1.z;
localData[tbx+tj].fy += dEdR2.y; tempForces.x += dEdR2.x;
localData[tbx+tj].fz += dEdR2.z; tempForces.y += dEdR2.y;
#endif tempForces.z += dEdR2.z;
#ifdef USE_CUTOFF #endif
} #ifdef USE_CUTOFF
}
#endif #endif
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
excl >>= 1; excl >>= 1;
#endif #endif
// cycles the indices // cycles the indices
// 0 1 2 3 4 5 6 7 -> 1 2 3 4 5 6 7 0 // 0 1 2 3 4 5 6 7 -> 1 2 3 4 5 6 7 0
SHUFFLE_WARP_DATA
tj = (tj + 1) & (TILE_SIZE - 1); tj = (tj + 1) & (TILE_SIZE - 1);
} }
}
// Write results. unsigned int offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (tempForces.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (tempForces.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (tempForces.z*0x100000000)));
}
unsigned int offset = x*TILE_SIZE + tgx; unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0x100000000))); atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000))); atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000))); atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
if (x != y) { //if (x != y) {
offset = y*TILE_SIZE + tgx; // offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000))); // atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (tempForces.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000))); // atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (tempForces.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000))); // atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (tempForces.z*0x100000000)));
} //}
} }
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
...@@ -304,8 +318,7 @@ extern "C" __global__ void computeNonbonded( ...@@ -304,8 +318,7 @@ extern "C" __global__ void computeNonbonded(
if (includeTile) { if (includeTile) {
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
// Load atom data for this tile. // Load atom data for this tile.
real4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
const unsigned int localAtomIndex = threadIdx.x; const unsigned int localAtomIndex = threadIdx.x;
...@@ -315,17 +328,26 @@ extern "C" __global__ void computeNonbonded( ...@@ -315,17 +328,26 @@ extern "C" __global__ void computeNonbonded(
unsigned int j = y*TILE_SIZE + tgx; unsigned int j = y*TILE_SIZE + tgx;
#endif #endif
atomIndices[threadIdx.x] = j; atomIndices[threadIdx.x] = j;
real4 tempPosq;
real3 tempForces;
tempForces.x = 0.0f;
tempForces.y = 0.0f;
tempForces.z = 0.0f;
DECLARE_LOCAL_PARAMETERS
if (j < PADDED_NUM_ATOMS) { if (j < PADDED_NUM_ATOMS) {
// Load position of atom j from from global memory // Load position of atom j from from global memory
real4 tempPosq = posq[j]; tempPosq = posq[j];
localData[localAtomIndex].x = tempPosq.x;
localData[localAtomIndex].y = tempPosq.y; //localData[localAtomIndex].x = tempPosq.x;
localData[localAtomIndex].z = tempPosq.z; //localData[localAtomIndex].y = tempPosq.y;
localData[localAtomIndex].q = tempPosq.w; //localData[localAtomIndex].z = tempPosq.z;
//localData[localAtomIndex].q = tempPosq.w;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData[localAtomIndex].fx = 0.0f; //localData[localAtomIndex].fx = 0.0f;
localData[localAtomIndex].fy = 0.0f; //localData[localAtomIndex].fy = 0.0f;
localData[localAtomIndex].fz = 0.0f; //localData[localAtomIndex].fz = 0.0f;
} }
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
if (singlePeriodicCopy) { if (singlePeriodicCopy) {
...@@ -336,61 +358,67 @@ extern "C" __global__ void computeNonbonded( ...@@ -336,61 +358,67 @@ extern "C" __global__ void computeNonbonded(
posq1.x -= floor((posq1.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; posq1.x -= floor((posq1.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
posq1.y -= floor((posq1.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; posq1.y -= floor((posq1.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
posq1.z -= floor((posq1.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; posq1.z -= floor((posq1.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
localData[localAtomIndex].x -= floor((localData[localAtomIndex].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
localData[localAtomIndex].y -= floor((localData[localAtomIndex].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; //localData[localAtomIndex].x -= floor((localData[localAtomIndex].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
localData[localAtomIndex].z -= floor((localData[localAtomIndex].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; //localData[localAtomIndex].y -= floor((localData[localAtomIndex].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
//localData[localAtomIndex].z -= floor((localData[localAtomIndex].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
tempPosq.x -= floor((tempPosq.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
tempPosq.y -= floor((tempPosq.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
tempPosq.z -= floor((tempPosq.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
unsigned int tj = tgx; unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) { for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj; int atom2 = tbx+tj;
real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q); real4 posq2 = tempPosq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z); real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
if (r2 < CUTOFF_SQUARED) { if (r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2); real invR = RSQRT(r2);
real r = RECIP(invR); real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = atomIndices[tbx+tj]; atom2 = atomIndices[tbx+tj];
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
real dEdR = 0.0f; real dEdR = 0.0f;
#else #else
real3 dEdR1 = make_real3(0); real3 dEdR1 = make_real3(0);
real3 dEdR2 = make_real3(0); real3 dEdR2 = make_real3(0);
#endif #endif
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS); bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS);
#endif #endif
real tempEnergy = 0.0f; real tempEnergy = 0.0f;
COMPUTE_INTERACTION COMPUTE_INTERACTION
energy += tempEnergy; energy += tempEnergy;
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
delta *= dEdR; delta *= dEdR;
force.x -= delta.x; force.x -= delta.x;
force.y -= delta.y; force.y -= delta.y;
force.z -= delta.z; force.z -= delta.z;
localData[tbx+tj].fx += delta.x; tempForces.x += delta.x;
localData[tbx+tj].fy += delta.y; tempForces.y += delta.y;
localData[tbx+tj].fz += delta.z; tempForces.z += delta.z;
#else #else
force.x -= dEdR1.x; force.x -= dEdR1.x;
force.y -= dEdR1.y; force.y -= dEdR1.y;
force.z -= dEdR1.z; force.z -= dEdR1.z;
localData[tbx+tj].fx += dEdR2.x; tempForces.x += dEdR2.x;
localData[tbx+tj].fy += dEdR2.y; tempForces.y += dEdR2.y;
localData[tbx+tj].fz += dEdR2.z; tempForces.z += dEdR2.z;
#endif #endif
} }
SHUFFLE_WARP_DATA
tj = (tj + 1) & (TILE_SIZE - 1); tj = (tj + 1) & (TILE_SIZE - 1);
} }
} }
else else
#endif #endif
{ {
// We need to apply periodic boundary conditions separately for each interaction. // We need to apply periodic boundary conditions separately for each interaction.
unsigned int tj = tgx; unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) { for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj; int atom2 = tbx+tj;
real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q); real4 posq2 = tempPosq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z); real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
...@@ -398,44 +426,46 @@ extern "C" __global__ void computeNonbonded( ...@@ -398,44 +426,46 @@ extern "C" __global__ void computeNonbonded(
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) { #ifdef USE_CUTOFF
#endif if (r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2); #endif
real r = RECIP(invR); real invR = RSQRT(r2);
LOAD_ATOM2_PARAMETERS real r = RECIP(invR);
atom2 = atomIndices[tbx+tj]; LOAD_ATOM2_PARAMETERS
#ifdef USE_SYMMETRIC atom2 = atomIndices[tbx+tj];
real dEdR = 0.0f; #ifdef USE_SYMMETRIC
#else real dEdR = 0.0f;
real3 dEdR1 = make_real3(0); #else
real3 dEdR2 = make_real3(0); real3 dEdR1 = make_real3(0);
#endif real3 dEdR2 = make_real3(0);
#ifdef USE_EXCLUSIONS #endif
bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS); #ifdef USE_EXCLUSIONS
#endif bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS);
real tempEnergy = 0.0f; #endif
COMPUTE_INTERACTION real tempEnergy = 0.0f;
energy += tempEnergy; COMPUTE_INTERACTION
#ifdef USE_SYMMETRIC energy += tempEnergy;
delta *= dEdR; #ifdef USE_SYMMETRIC
force.x -= delta.x; delta *= dEdR;
force.y -= delta.y; force.x -= delta.x;
force.z -= delta.z; force.y -= delta.y;
localData[tbx+tj].fx += delta.x; force.z -= delta.z;
localData[tbx+tj].fy += delta.y; tempForces.x += delta.x;
localData[tbx+tj].fz += delta.z; tempForces.y += delta.y;
#else tempForces.z += delta.z;
force.x -= dEdR1.x; #else
force.y -= dEdR1.y; force.x -= dEdR1.x;
force.z -= dEdR1.z; force.y -= dEdR1.y;
localData[tbx+tj].fx += dEdR2.x; force.z -= dEdR1.z;
localData[tbx+tj].fy += dEdR2.y; tempForces.x += dEdR2.x;
localData[tbx+tj].fz += dEdR2.z; tempForces.y += dEdR2.y;
#endif tempForces.z += dEdR2.z;
#ifdef USE_CUTOFF #endif
} #ifdef USE_CUTOFF
}
#endif #endif
SHUFFLE_WARP_DATA
tj = (tj + 1) & (TILE_SIZE - 1); tj = (tj + 1) & (TILE_SIZE - 1);
} }
} }
...@@ -451,12 +481,13 @@ extern "C" __global__ void computeNonbonded( ...@@ -451,12 +481,13 @@ extern "C" __global__ void computeNonbonded(
unsigned int atom2 = y*TILE_SIZE + tgx; unsigned int atom2 = y*TILE_SIZE + tgx;
#endif #endif
if (atom2 < PADDED_NUM_ATOMS) { if (atom2 < PADDED_NUM_ATOMS) {
atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000))); atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (tempForces.x*0x100000000)));
atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000))); atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (tempForces.y*0x100000000)));
atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000))); atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (tempForces.z*0x100000000)));
} }
} }
pos++; pos++;
} }
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy; energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
} }
...@@ -872,21 +872,21 @@ int main(int argc, char* argv[]) { ...@@ -872,21 +872,21 @@ int main(int argc, char* argv[]) {
try { try {
if (argc > 1) if (argc > 1)
platform.setPropertyDefaultValue("CudaPrecision", string(argv[1])); platform.setPropertyDefaultValue("CudaPrecision", string(argv[1]));
testCoulomb(); //testCoulomb();
testLJ(); //testLJ();
testExclusionsAnd14(); //testExclusionsAnd14();
testCutoff(); //testCutoff();
testCutoff14(); //testCutoff14();
testPeriodic(); //testPeriodic();
testLargeSystem(); testLargeSystem();
//testBlockInteractions(false); //testBlockInteractions(false);
//testBlockInteractions(true); //testBlockInteractions(true);
testDispersionCorrection(); //testDispersionCorrection();
testChangingParameters(); //testChangingParameters();
testParallelComputation(false); //testParallelComputation(false);
testParallelComputation(true); //testParallelComputation(true);
testSwitchingFunction(NonbondedForce::CutoffNonPeriodic); //testSwitchingFunction(NonbondedForce::CutoffNonPeriodic);
testSwitchingFunction(NonbondedForce::PME); //testSwitchingFunction(NonbondedForce::PME);
} }
catch(const exception& e) { catch(const exception& e) {
cout << "exception: " << e.what() << endl; cout << "exception: " << e.what() << endl;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment