Commit a409f0e8 authored by Mark Friedrichs's avatar Mark Friedrichs
Browse files

Update thread counts for Fermi-board

parent 80c69c93
......@@ -216,22 +216,44 @@ T& CUDAStream<T>::operator[](int index)
// Per-architecture launch-configuration constants.
// The visible gpuInit hunks pick a family from deviceProp.regsPerBlock:
// exactly 8192 selects the G8X values, any other value up to 16384 selects
// the GT2XX values, and anything larger selects the GF1XX values.
static const unsigned int GRID = 32;
// NOTE(review): 32 == 1 << 5, so GRIDBITS is presumably log2(GRID) - confirm.
static const unsigned int GRIDBITS = 5;
// Blocks per multiprocessor. gpuInit multiplies the selected value by
// deviceProp.multiProcessorCount to size nonbond_blocks and bornForce2_blocks.
static const int G8X_BLOCKS_PER_SM = 1;
static const int GT2XX_BLOCKS_PER_SM = 1;
static const int GF1XX_BLOCKS_PER_SM = 3;
// Copied into sim.nonbond_threads_per_block by gpuInit.
static const int G8X_NONBOND_THREADS_PER_BLOCK = 256;
static const int GT2XX_NONBOND_THREADS_PER_BLOCK = 320;
static const int GF1XX_NONBOND_THREADS_PER_BLOCK = 256;
// Disabled alternative value for the GF1XX nonbond thread count.
//static const int GF1XX_NONBOND_THREADS_PER_BLOCK = 768;
// Copied into sim.bornForce2_threads_per_block by gpuInit.
static const int G8X_BORNFORCE2_THREADS_PER_BLOCK = 256;
static const int GT2XX_BORNFORCE2_THREADS_PER_BLOCK = 320;
static const int GF1XX_BORNFORCE2_THREADS_PER_BLOCK = 256;
// Disabled alternative value for the GF1XX Born Force 2 thread count.
//static const int GF1XX_BORNFORCE2_THREADS_PER_BLOCK = 768;
// Copied into sim.max_shake_threads_per_block by gpuInit.
static const int G8X_SHAKE_THREADS_PER_BLOCK = 128;
static const int GT2XX_SHAKE_THREADS_PER_BLOCK = 256;
static const int GF1XX_SHAKE_THREADS_PER_BLOCK = 512;
// Copied into sim.max_update_threads_per_block by gpuInit.
static const int G8X_UPDATE_THREADS_PER_BLOCK = 192;
static const int GT2XX_UPDATE_THREADS_PER_BLOCK = 384;
static const int GF1XX_UPDATE_THREADS_PER_BLOCK = 768;
// Copied into sim.max_localForces_threads_per_block by gpuInit.
static const int G8X_LOCALFORCES_THREADS_PER_BLOCK = 192;
static const int GT2XX_LOCALFORCES_THREADS_PER_BLOCK = 384;
static const int GF1XX_LOCALFORCES_THREADS_PER_BLOCK = 768;
// NOTE(review): in the visible gpuInit hunks only G8X_THREADS_PER_BLOCK is
// assigned to sim.threads_per_block; the GT2XX and GF1XX branches assign the
// corresponding *_NONBOND_THREADS_PER_BLOCK value instead, which leaves the
// GT2XX/GF1XX constants below unused there - confirm whether that is intended.
static const int G8X_THREADS_PER_BLOCK = 256;
static const int GT2XX_THREADS_PER_BLOCK = 256;
static const int GF1XX_THREADS_PER_BLOCK = 512;
// Copied into sim.random_threads_per_block by gpuInit.
static const int G8X_RANDOM_THREADS_PER_BLOCK = 256;
static const int GT2XX_RANDOM_THREADS_PER_BLOCK = 384;
static const int GF1XX_RANDOM_THREADS_PER_BLOCK = 768;
// The GT2XX and GF1XX values are copied into sim.workUnitsPerSM by gpuInit;
// the G8X assignment is outside the visible hunks.
static const int G8X_NONBOND_WORKUNITS_PER_SM = 220;
static const int GT2XX_NONBOND_WORKUNITS_PER_SM = 256;
static const int GF1XX_NONBOND_WORKUNITS_PER_SM = 256;
static const unsigned int MAX_STACK_SIZE = 8;
static const unsigned int MAX_TABULATED_FUNCTIONS = 4;
......@@ -265,6 +287,7 @@ struct cudaGmxSimulation {
unsigned int atoms; // Number of atoms
unsigned int paddedNumberOfAtoms; // Padded number of atoms
unsigned int blocks; // Number of blocks to launch across linear kernels
unsigned int blocksPerSM; // Number of blocks per multiprocessor (SM)
unsigned int nonbond_blocks; // Number of blocks to launch across CDLJ and Born Force Part1
unsigned int bornForce2_blocks; // Number of blocks to launch across Born Force 2
unsigned int interaction_blocks; // Number of blocks to launch when identifying interacting tiles
......
......@@ -1352,9 +1352,11 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
// Find connected constraints for CCMA.
vector<int> ccmaConstraints;
/*
for (unsigned i = 0; i < atom1.size(); i++)
if (!isShakeAtom[atom1[i]])
ccmaConstraints.push_back(i);
*/
// Record the connections between constraints.
......@@ -1832,11 +1834,13 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync)
gpu->sim.workUnitsPerSM = GT2XX_NONBOND_WORKUNITS_PER_SM;
break;
}
}
else
{
gpu->sm_version = SM_20;
gpu->sim.workUnitsPerSM = GF1XX_NONBOND_WORKUNITS_PER_SM;
}
gpu->sim.nonbond_blocks = deviceProp.multiProcessorCount;
gpu->sim.bornForce2_blocks = deviceProp.multiProcessorCount;
gpu->sim.blocks = deviceProp.multiProcessorCount;
if (deviceProp.regsPerBlock == 8192)
{
gpu->sim.nonbond_threads_per_block = G8X_NONBOND_THREADS_PER_BLOCK;
......@@ -1846,8 +1850,9 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync)
gpu->sim.max_localForces_threads_per_block = G8X_LOCALFORCES_THREADS_PER_BLOCK;
gpu->sim.threads_per_block = G8X_THREADS_PER_BLOCK;
gpu->sim.random_threads_per_block = G8X_RANDOM_THREADS_PER_BLOCK;
gpu->blocksPerSM = G8X_BLOCKS_PER_SM;
}
else
else if (deviceProp.regsPerBlock <= 16384)
{
gpu->sim.nonbond_threads_per_block = GT2XX_NONBOND_THREADS_PER_BLOCK;
gpu->sim.bornForce2_threads_per_block = GT2XX_BORNFORCE2_THREADS_PER_BLOCK;
......@@ -1856,7 +1861,23 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync)
gpu->sim.max_localForces_threads_per_block = GT2XX_LOCALFORCES_THREADS_PER_BLOCK;
gpu->sim.threads_per_block = GT2XX_NONBOND_THREADS_PER_BLOCK;
gpu->sim.random_threads_per_block = GT2XX_RANDOM_THREADS_PER_BLOCK;
gpu->blocksPerSM = GT2XX_BLOCKS_PER_SM;
}
else
{
gpu->sim.nonbond_threads_per_block = GF1XX_NONBOND_THREADS_PER_BLOCK;
gpu->sim.bornForce2_threads_per_block = GF1XX_BORNFORCE2_THREADS_PER_BLOCK;
gpu->sim.max_shake_threads_per_block = GF1XX_SHAKE_THREADS_PER_BLOCK;
gpu->sim.max_update_threads_per_block = GF1XX_UPDATE_THREADS_PER_BLOCK;
gpu->sim.max_localForces_threads_per_block = GF1XX_LOCALFORCES_THREADS_PER_BLOCK;
gpu->sim.threads_per_block = GF1XX_NONBOND_THREADS_PER_BLOCK;
gpu->sim.random_threads_per_block = GF1XX_RANDOM_THREADS_PER_BLOCK;
gpu->blocksPerSM = GF1XX_BLOCKS_PER_SM;
}
gpu->sim.nonbond_blocks = deviceProp.multiProcessorCount*gpu->blocksPerSM;
gpu->sim.bornForce2_blocks = deviceProp.multiProcessorCount*gpu->blocksPerSM;
gpu->sim.blocks = deviceProp.multiProcessorCount;
gpu->sim.shake_threads_per_block = gpu->sim.max_shake_threads_per_block;
gpu->sim.localForces_threads_per_block = gpu->sim.max_localForces_threads_per_block;
......
......@@ -55,7 +55,8 @@ enum SM_VERSION
{
SM_10,
SM_11,
SM_12
SM_12,
SM_20
};
......@@ -70,6 +71,7 @@ struct _gpuContext {
bool useBlockingSync;
gpuAtomType* gpAtomTable;
int gAtomTypes;
unsigned int blocksPerSM;
cudaGmxSimulation sim;
unsigned int* pOutputBufferCounter;
std::vector<std::vector<int> > exclusions;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment