Commit a409f0e8 authored by Mark Friedrichs's avatar Mark Friedrichs
Browse files

Update thread counts for Fermi-board

parent 80c69c93
...@@ -216,22 +216,44 @@ T& CUDAStream<T>::operator[](int index) ...@@ -216,22 +216,44 @@ T& CUDAStream<T>::operator[](int index)
static const unsigned int GRID = 32; static const unsigned int GRID = 32;
static const unsigned int GRIDBITS = 5; static const unsigned int GRIDBITS = 5;
static const int G8X_BLOCKS_PER_SM = 1;
static const int GT2XX_BLOCKS_PER_SM = 1;
static const int GF1XX_BLOCKS_PER_SM = 3;
static const int G8X_NONBOND_THREADS_PER_BLOCK = 256; static const int G8X_NONBOND_THREADS_PER_BLOCK = 256;
static const int GT2XX_NONBOND_THREADS_PER_BLOCK = 320; static const int GT2XX_NONBOND_THREADS_PER_BLOCK = 320;
static const int GF1XX_NONBOND_THREADS_PER_BLOCK = 256;
//static const int GF1XX_NONBOND_THREADS_PER_BLOCK = 768;
static const int G8X_BORNFORCE2_THREADS_PER_BLOCK = 256; static const int G8X_BORNFORCE2_THREADS_PER_BLOCK = 256;
static const int GT2XX_BORNFORCE2_THREADS_PER_BLOCK = 320; static const int GT2XX_BORNFORCE2_THREADS_PER_BLOCK = 320;
static const int GF1XX_BORNFORCE2_THREADS_PER_BLOCK = 256;
//static const int GF1XX_BORNFORCE2_THREADS_PER_BLOCK = 768;
static const int G8X_SHAKE_THREADS_PER_BLOCK = 128; static const int G8X_SHAKE_THREADS_PER_BLOCK = 128;
static const int GT2XX_SHAKE_THREADS_PER_BLOCK = 256; static const int GT2XX_SHAKE_THREADS_PER_BLOCK = 256;
static const int GF1XX_SHAKE_THREADS_PER_BLOCK = 512;
static const int G8X_UPDATE_THREADS_PER_BLOCK = 192; static const int G8X_UPDATE_THREADS_PER_BLOCK = 192;
static const int GT2XX_UPDATE_THREADS_PER_BLOCK = 384; static const int GT2XX_UPDATE_THREADS_PER_BLOCK = 384;
static const int GF1XX_UPDATE_THREADS_PER_BLOCK = 768;
static const int G8X_LOCALFORCES_THREADS_PER_BLOCK = 192; static const int G8X_LOCALFORCES_THREADS_PER_BLOCK = 192;
static const int GT2XX_LOCALFORCES_THREADS_PER_BLOCK = 384; static const int GT2XX_LOCALFORCES_THREADS_PER_BLOCK = 384;
static const int GF1XX_LOCALFORCES_THREADS_PER_BLOCK = 768;
static const int G8X_THREADS_PER_BLOCK = 256; static const int G8X_THREADS_PER_BLOCK = 256;
static const int GT2XX_THREADS_PER_BLOCK = 256; static const int GT2XX_THREADS_PER_BLOCK = 256;
static const int GF1XX_THREADS_PER_BLOCK = 512;
static const int G8X_RANDOM_THREADS_PER_BLOCK = 256; static const int G8X_RANDOM_THREADS_PER_BLOCK = 256;
static const int GT2XX_RANDOM_THREADS_PER_BLOCK = 384; static const int GT2XX_RANDOM_THREADS_PER_BLOCK = 384;
static const int GF1XX_RANDOM_THREADS_PER_BLOCK = 768;
static const int G8X_NONBOND_WORKUNITS_PER_SM = 220; static const int G8X_NONBOND_WORKUNITS_PER_SM = 220;
static const int GT2XX_NONBOND_WORKUNITS_PER_SM = 256; static const int GT2XX_NONBOND_WORKUNITS_PER_SM = 256;
static const int GF1XX_NONBOND_WORKUNITS_PER_SM = 256;
static const unsigned int MAX_STACK_SIZE = 8; static const unsigned int MAX_STACK_SIZE = 8;
static const unsigned int MAX_TABULATED_FUNCTIONS = 4; static const unsigned int MAX_TABULATED_FUNCTIONS = 4;
...@@ -265,6 +287,7 @@ struct cudaGmxSimulation { ...@@ -265,6 +287,7 @@ struct cudaGmxSimulation {
unsigned int atoms; // Number of atoms unsigned int atoms; // Number of atoms
unsigned int paddedNumberOfAtoms; // Padded number of atoms unsigned int paddedNumberOfAtoms; // Padded number of atoms
unsigned int blocks; // Number of blocks to launch across linear kernels unsigned int blocks; // Number of blocks to launch across linear kernels
unsigned int blocksPerSM; // Number of blocks to launch per multiprocessor (SM)
unsigned int nonbond_blocks; // Number of blocks to launch across CDLJ and Born Force Part1 unsigned int nonbond_blocks; // Number of blocks to launch across CDLJ and Born Force Part1
unsigned int bornForce2_blocks; // Number of blocks to launch across Born Force 2 unsigned int bornForce2_blocks; // Number of blocks to launch across Born Force 2
unsigned int interaction_blocks; // Number of blocks to launch when identifying interacting tiles unsigned int interaction_blocks; // Number of blocks to launch when identifying interacting tiles
......
...@@ -1352,9 +1352,11 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const ...@@ -1352,9 +1352,11 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
// Find connected constraints for CCMA. // Find connected constraints for CCMA.
vector<int> ccmaConstraints; vector<int> ccmaConstraints;
/*
for (unsigned i = 0; i < atom1.size(); i++) for (unsigned i = 0; i < atom1.size(); i++)
if (!isShakeAtom[atom1[i]]) if (!isShakeAtom[atom1[i]])
ccmaConstraints.push_back(i); ccmaConstraints.push_back(i);
*/
// Record the connections between constraints. // Record the connections between constraints.
...@@ -1832,11 +1834,13 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync) ...@@ -1832,11 +1834,13 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync)
gpu->sim.workUnitsPerSM = GT2XX_NONBOND_WORKUNITS_PER_SM; gpu->sim.workUnitsPerSM = GT2XX_NONBOND_WORKUNITS_PER_SM;
break; break;
} }
}
else
{
gpu->sm_version = SM_20;
gpu->sim.workUnitsPerSM = GF1XX_NONBOND_WORKUNITS_PER_SM;
} }
gpu->sim.nonbond_blocks = deviceProp.multiProcessorCount;
gpu->sim.bornForce2_blocks = deviceProp.multiProcessorCount;
gpu->sim.blocks = deviceProp.multiProcessorCount;
if (deviceProp.regsPerBlock == 8192) if (deviceProp.regsPerBlock == 8192)
{ {
gpu->sim.nonbond_threads_per_block = G8X_NONBOND_THREADS_PER_BLOCK; gpu->sim.nonbond_threads_per_block = G8X_NONBOND_THREADS_PER_BLOCK;
...@@ -1846,8 +1850,9 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync) ...@@ -1846,8 +1850,9 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync)
gpu->sim.max_localForces_threads_per_block = G8X_LOCALFORCES_THREADS_PER_BLOCK; gpu->sim.max_localForces_threads_per_block = G8X_LOCALFORCES_THREADS_PER_BLOCK;
gpu->sim.threads_per_block = G8X_THREADS_PER_BLOCK; gpu->sim.threads_per_block = G8X_THREADS_PER_BLOCK;
gpu->sim.random_threads_per_block = G8X_RANDOM_THREADS_PER_BLOCK; gpu->sim.random_threads_per_block = G8X_RANDOM_THREADS_PER_BLOCK;
gpu->blocksPerSM = G8X_BLOCKS_PER_SM;
} }
else else if (deviceProp.regsPerBlock <= 16384)
{ {
gpu->sim.nonbond_threads_per_block = GT2XX_NONBOND_THREADS_PER_BLOCK; gpu->sim.nonbond_threads_per_block = GT2XX_NONBOND_THREADS_PER_BLOCK;
gpu->sim.bornForce2_threads_per_block = GT2XX_BORNFORCE2_THREADS_PER_BLOCK; gpu->sim.bornForce2_threads_per_block = GT2XX_BORNFORCE2_THREADS_PER_BLOCK;
...@@ -1856,7 +1861,23 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync) ...@@ -1856,7 +1861,23 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync)
gpu->sim.max_localForces_threads_per_block = GT2XX_LOCALFORCES_THREADS_PER_BLOCK; gpu->sim.max_localForces_threads_per_block = GT2XX_LOCALFORCES_THREADS_PER_BLOCK;
gpu->sim.threads_per_block = GT2XX_NONBOND_THREADS_PER_BLOCK; gpu->sim.threads_per_block = GT2XX_NONBOND_THREADS_PER_BLOCK;
gpu->sim.random_threads_per_block = GT2XX_RANDOM_THREADS_PER_BLOCK; gpu->sim.random_threads_per_block = GT2XX_RANDOM_THREADS_PER_BLOCK;
gpu->blocksPerSM = GT2XX_BLOCKS_PER_SM;
} }
else
{
gpu->sim.nonbond_threads_per_block = GF1XX_NONBOND_THREADS_PER_BLOCK;
gpu->sim.bornForce2_threads_per_block = GF1XX_BORNFORCE2_THREADS_PER_BLOCK;
gpu->sim.max_shake_threads_per_block = GF1XX_SHAKE_THREADS_PER_BLOCK;
gpu->sim.max_update_threads_per_block = GF1XX_UPDATE_THREADS_PER_BLOCK;
gpu->sim.max_localForces_threads_per_block = GF1XX_LOCALFORCES_THREADS_PER_BLOCK;
gpu->sim.threads_per_block = GF1XX_NONBOND_THREADS_PER_BLOCK;
gpu->sim.random_threads_per_block = GF1XX_RANDOM_THREADS_PER_BLOCK;
gpu->blocksPerSM = GF1XX_BLOCKS_PER_SM;
}
gpu->sim.nonbond_blocks = deviceProp.multiProcessorCount*gpu->blocksPerSM;
gpu->sim.bornForce2_blocks = deviceProp.multiProcessorCount*gpu->blocksPerSM;
gpu->sim.blocks = deviceProp.multiProcessorCount;
gpu->sim.shake_threads_per_block = gpu->sim.max_shake_threads_per_block; gpu->sim.shake_threads_per_block = gpu->sim.max_shake_threads_per_block;
gpu->sim.localForces_threads_per_block = gpu->sim.max_localForces_threads_per_block; gpu->sim.localForces_threads_per_block = gpu->sim.max_localForces_threads_per_block;
......
...@@ -55,7 +55,8 @@ enum SM_VERSION ...@@ -55,7 +55,8 @@ enum SM_VERSION
{ {
SM_10, SM_10,
SM_11, SM_11,
SM_12 SM_12,
SM_20
}; };
...@@ -70,6 +71,7 @@ struct _gpuContext { ...@@ -70,6 +71,7 @@ struct _gpuContext {
bool useBlockingSync; bool useBlockingSync;
gpuAtomType* gpAtomTable; gpuAtomType* gpAtomTable;
int gAtomTypes; int gAtomTypes;
unsigned int blocksPerSM;
cudaGmxSimulation sim; cudaGmxSimulation sim;
unsigned int* pOutputBufferCounter; unsigned int* pOutputBufferCounter;
std::vector<std::vector<int> > exclusions; std::vector<std::vector<int> > exclusions;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment