Update thread counts for Fermi-board

a409f0e8 · Mark Friedrichs · 80c69c93 · a409f0e8 · a409f0e8 · a409f0e8
Commit a409f0e8 authored Mar 19, 2010 by Mark Friedrichs
3 changed files
--- a/platforms/cuda/src/kernels/cudatypes.h
+++ b/platforms/cuda/src/kernels/cudatypes.h
@@ -216,22 +216,44 @@ T& CUDAStream<T>::operator[](int index)
 static const unsigned int GRID = 32;
 static const unsigned int GRIDBITS = 5;
+static const int G8X_BLOCKS_PER_SM                      = 1;
+static const int GT2XX_BLOCKS_PER_SM                    = 1;
+static const int GF1XX_BLOCKS_PER_SM                    = 3;
 static const int G8X_NONBOND_THREADS_PER_BLOCK          = 256;
 static const int GT2XX_NONBOND_THREADS_PER_BLOCK        = 320;
+static const int GF1XX_NONBOND_THREADS_PER_BLOCK        = 256;
+//static const int GF1XX_NONBOND_THREADS_PER_BLOCK        = 768;
 static const int G8X_BORNFORCE2_THREADS_PER_BLOCK       = 256;
 static const int GT2XX_BORNFORCE2_THREADS_PER_BLOCK     = 320;
+static const int GF1XX_BORNFORCE2_THREADS_PER_BLOCK     = 256;
+//static const int GF1XX_BORNFORCE2_THREADS_PER_BLOCK     = 768;
 static const int G8X_SHAKE_THREADS_PER_BLOCK            = 128;
 static const int GT2XX_SHAKE_THREADS_PER_BLOCK          = 256;
+static const int GF1XX_SHAKE_THREADS_PER_BLOCK          = 512;
 static const int G8X_UPDATE_THREADS_PER_BLOCK           = 192;
 static const int GT2XX_UPDATE_THREADS_PER_BLOCK         = 384;
+static const int GF1XX_UPDATE_THREADS_PER_BLOCK         = 768;
 static const int G8X_LOCALFORCES_THREADS_PER_BLOCK      = 192;
 static const int GT2XX_LOCALFORCES_THREADS_PER_BLOCK    = 384;
+static const int GF1XX_LOCALFORCES_THREADS_PER_BLOCK    = 768;
 static const int G8X_THREADS_PER_BLOCK                  = 256;
 static const int GT2XX_THREADS_PER_BLOCK                = 256;
+static const int GF1XX_THREADS_PER_BLOCK                = 512;
 static const int G8X_RANDOM_THREADS_PER_BLOCK           = 256;
 static const int GT2XX_RANDOM_THREADS_PER_BLOCK         = 384;
+static const int GF1XX_RANDOM_THREADS_PER_BLOCK         = 768;
 static const int G8X_NONBOND_WORKUNITS_PER_SM           = 220;
 static const int GT2XX_NONBOND_WORKUNITS_PER_SM         = 256;
+static const int GF1XX_NONBOND_WORKUNITS_PER_SM         = 256;
 static const unsigned int MAX_STACK_SIZE = 8;
 static const unsigned int MAX_TABULATED_FUNCTIONS = 4;
@@ -265,6 +287,7 @@ struct cudaGmxSimulation {
    unsigned int    atoms;                          // Number of atoms
    unsigned int    paddedNumberOfAtoms;            // Padded number of atoms
    unsigned int    blocks;                         // Number of blocks to launch across linear kernels
+    unsigned int    blocksPerSM;                    // Number of blocks per streaming multiprocessor (SM)
    unsigned int    nonbond_blocks;                 // Number of blocks to launch across CDLJ and Born Force Part1
    unsigned int    bornForce2_blocks;              // Number of blocks to launch across Born Force 2
    unsigned int    interaction_blocks;             // Number of blocks to launch when identifying interacting tiles

--- a/platforms/cuda/src/kernels/gpu.cpp
+++ b/platforms/cuda/src/kernels/gpu.cpp
@@ -1352,9 +1352,11 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
    // Find connected constraints for CCMA.
    vector<int> ccmaConstraints;
+/*
    for (unsigned i = 0; i < atom1.size(); i++)
        if (!isShakeAtom[atom1[i]])
            ccmaConstraints.push_back(i);
+*/
    // Record the connections between constraints.
@@ -1832,11 +1834,13 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync)
            gpu->sim.workUnitsPerSM = GT2XX_NONBOND_WORKUNITS_PER_SM;
            break;
        }
+    } 
+    else
+    {    
+        gpu->sm_version = SM_20;
+        gpu->sim.workUnitsPerSM = GF1XX_NONBOND_WORKUNITS_PER_SM;
    }
-    gpu->sim.nonbond_blocks = deviceProp.multiProcessorCount;
-    gpu->sim.bornForce2_blocks = deviceProp.multiProcessorCount;
-    gpu->sim.blocks = deviceProp.multiProcessorCount;
    if (deviceProp.regsPerBlock == 8192)
    {
        gpu->sim.nonbond_threads_per_block          = G8X_NONBOND_THREADS_PER_BLOCK;
@@ -1846,8 +1850,9 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync)
        gpu->sim.max_localForces_threads_per_block  = G8X_LOCALFORCES_THREADS_PER_BLOCK;
        gpu->sim.threads_per_block                  = G8X_THREADS_PER_BLOCK;
        gpu->sim.random_threads_per_block           = G8X_RANDOM_THREADS_PER_BLOCK;
+        gpu->blocksPerSM                            = G8X_BLOCKS_PER_SM;
    }
-    else
+    else if (deviceProp.regsPerBlock <= 16384)
    {
        gpu->sim.nonbond_threads_per_block          = GT2XX_NONBOND_THREADS_PER_BLOCK;
        gpu->sim.bornForce2_threads_per_block       = GT2XX_BORNFORCE2_THREADS_PER_BLOCK;
@@ -1856,7 +1861,23 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync)
        gpu->sim.max_localForces_threads_per_block  = GT2XX_LOCALFORCES_THREADS_PER_BLOCK;
        gpu->sim.threads_per_block                  = GT2XX_NONBOND_THREADS_PER_BLOCK;
        gpu->sim.random_threads_per_block           = GT2XX_RANDOM_THREADS_PER_BLOCK;
+        gpu->blocksPerSM                            = GT2XX_BLOCKS_PER_SM;
    }
+    else
+    {
+        gpu->sim.nonbond_threads_per_block          = GF1XX_NONBOND_THREADS_PER_BLOCK;
+        gpu->sim.bornForce2_threads_per_block       = GF1XX_BORNFORCE2_THREADS_PER_BLOCK;
+        gpu->sim.max_shake_threads_per_block        = GF1XX_SHAKE_THREADS_PER_BLOCK;
+        gpu->sim.max_update_threads_per_block       = GF1XX_UPDATE_THREADS_PER_BLOCK;
+        gpu->sim.max_localForces_threads_per_block  = GF1XX_LOCALFORCES_THREADS_PER_BLOCK;
+        gpu->sim.threads_per_block                  = GF1XX_NONBOND_THREADS_PER_BLOCK;
+        gpu->sim.random_threads_per_block           = GF1XX_RANDOM_THREADS_PER_BLOCK;
+        gpu->blocksPerSM                            = GF1XX_BLOCKS_PER_SM;
+    }
+    gpu->sim.nonbond_blocks = deviceProp.multiProcessorCount*gpu->blocksPerSM;
+    gpu->sim.bornForce2_blocks = deviceProp.multiProcessorCount*gpu->blocksPerSM;
+    gpu->sim.blocks = deviceProp.multiProcessorCount;
    gpu->sim.shake_threads_per_block                = gpu->sim.max_shake_threads_per_block;
    gpu->sim.localForces_threads_per_block          = gpu->sim.max_localForces_threads_per_block;

--- a/platforms/cuda/src/kernels/gputypes.h
+++ b/platforms/cuda/src/kernels/gputypes.h
@@ -55,7 +55,8 @@ enum SM_VERSION
 {
    SM_10,
    SM_11,
-    SM_12
+    SM_12,
+    SM_20
 };
@@ -70,6 +71,7 @@ struct _gpuContext {
    bool useBlockingSync;
    gpuAtomType* gpAtomTable;
    int gAtomTypes;
+    unsigned int blocksPerSM;
    cudaGmxSimulation sim;
    unsigned int* pOutputBufferCounter;
    std::vector<std::vector<int> > exclusions;