"wrappers/python/vscode:/vscode.git/clone" did not exist on "234425d1741fce83813052f8fab2cd092129cc67"
Commit 734d5d67 authored by Scott Le Grand's avatar Scott Le Grand
Browse files

No commit message

No commit message
parent a409f0e8
...@@ -222,12 +222,12 @@ static const int GF1XX_BLOCKS_PER_SM = 3; ...@@ -222,12 +222,12 @@ static const int GF1XX_BLOCKS_PER_SM = 3;
static const int G8X_NONBOND_THREADS_PER_BLOCK = 256; static const int G8X_NONBOND_THREADS_PER_BLOCK = 256;
static const int GT2XX_NONBOND_THREADS_PER_BLOCK = 320; static const int GT2XX_NONBOND_THREADS_PER_BLOCK = 320;
static const int GF1XX_NONBOND_THREADS_PER_BLOCK = 256; static const int GF1XX_NONBOND_THREADS_PER_BLOCK = 768;
//static const int GF1XX_NONBOND_THREADS_PER_BLOCK = 768; //static const int GF1XX_NONBOND_THREADS_PER_BLOCK = 768;
static const int G8X_BORNFORCE2_THREADS_PER_BLOCK = 256; static const int G8X_BORNFORCE2_THREADS_PER_BLOCK = 256;
static const int GT2XX_BORNFORCE2_THREADS_PER_BLOCK = 320; static const int GT2XX_BORNFORCE2_THREADS_PER_BLOCK = 320;
static const int GF1XX_BORNFORCE2_THREADS_PER_BLOCK = 256; static const int GF1XX_BORNFORCE2_THREADS_PER_BLOCK = 768;
//static const int GF1XX_BORNFORCE2_THREADS_PER_BLOCK = 768; //static const int GF1XX_BORNFORCE2_THREADS_PER_BLOCK = 768;
static const int G8X_SHAKE_THREADS_PER_BLOCK = 128; static const int G8X_SHAKE_THREADS_PER_BLOCK = 128;
...@@ -252,7 +252,7 @@ static const int GF1XX_RANDOM_THREADS_PER_BLOCK = 768; ...@@ -252,7 +252,7 @@ static const int GF1XX_RANDOM_THREADS_PER_BLOCK = 768;
static const int G8X_NONBOND_WORKUNITS_PER_SM = 220; static const int G8X_NONBOND_WORKUNITS_PER_SM = 220;
static const int GT2XX_NONBOND_WORKUNITS_PER_SM = 256; static const int GT2XX_NONBOND_WORKUNITS_PER_SM = 256;
static const int GF1XX_NONBOND_WORKUNITS_PER_SM = 256; static const int GF1XX_NONBOND_WORKUNITS_PER_SM = 768;
static const unsigned int MAX_STACK_SIZE = 8; static const unsigned int MAX_STACK_SIZE = 8;
static const unsigned int MAX_TABULATED_FUNCTIONS = 4; static const unsigned int MAX_TABULATED_FUNCTIONS = 4;
......
...@@ -51,7 +51,15 @@ void GetBrownianUpdateSim(gpuContext gpu) ...@@ -51,7 +51,15 @@ void GetBrownianUpdateSim(gpuContext gpu)
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed"); RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
} }
__global__ void kBrownianUpdatePart1_kernel() __global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
#endif
void kBrownianUpdatePart1_kernel()
{ {
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x; unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int rpos = cSim.pRandomPosition[blockIdx.x]; unsigned int rpos = cSim.pRandomPosition[blockIdx.x];
...@@ -82,7 +90,15 @@ void kBrownianUpdatePart1(gpuContext gpu) ...@@ -82,7 +90,15 @@ void kBrownianUpdatePart1(gpuContext gpu)
LAUNCHERROR("kBrownianUpdatePart1"); LAUNCHERROR("kBrownianUpdatePart1");
} }
__global__ void kBrownianUpdatePart2_kernel() __global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
#endif
void kBrownianUpdatePart2_kernel()
{ {
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x; unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int rpos = cSim.pRandomPosition[blockIdx.x]; unsigned int rpos = cSim.pRandomPosition[blockIdx.x];
......
...@@ -33,7 +33,15 @@ ...@@ -33,7 +33,15 @@
/* Cuda compiler on Windows does not recognized "static const float" values */ /* Cuda compiler on Windows does not recognized "static const float" values */
#define LOCAL_HACK_PI 3.1415926535897932384626433832795 #define LOCAL_HACK_PI 3.1415926535897932384626433832795
__global__ void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUnit) __global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
#endif
void METHOD_NAME(kCalculateCDLJ, Forces_kernel)(unsigned int* workUnit)
{ {
extern __shared__ Atom sA[]; extern __shared__ Atom sA[];
unsigned int totalWarps = gridDim.x*blockDim.x/GRID; unsigned int totalWarps = gridDim.x*blockDim.x/GRID;
......
...@@ -33,7 +33,15 @@ ...@@ -33,7 +33,15 @@
/* Cuda compiler on Windows does not recognized "static const float" values */ /* Cuda compiler on Windows does not recognized "static const float" values */
#define LOCAL_HACK_PI 3.1415926535897932384626433832795 #define LOCAL_HACK_PI 3.1415926535897932384626433832795
__global__ void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit) __global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
#endif
void METHOD_NAME(kCalculateCDLJObcGbsa, Forces1_kernel)(unsigned int* workUnit)
{ {
extern __shared__ Atom sA[]; extern __shared__ Atom sA[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID; unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
......
...@@ -118,7 +118,15 @@ void SetCustomTorsionGlobalParams(const vector<float>& paramValues) ...@@ -118,7 +118,15 @@ void SetCustomTorsionGlobalParams(const vector<float>& paramValues)
angle = (dp >= 0) ? angle : -angle; \ angle = (dp >= 0) ? angle : -angle; \
} }
__global__ void kCalculateCustomTorsionForces_kernel() __global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_LOCALFORCES_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_LOCALFORCES_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_LOCALFORCES_THREADS_PER_BLOCK, 1)
#endif
void kCalculateCustomTorsionForces_kernel()
{ {
extern __shared__ float stack[]; extern __shared__ float stack[];
float* variables = (float*) &stack[cSim.customExpressionStackSize*blockDim.x]; float* variables = (float*) &stack[cSim.customExpressionStackSize*blockDim.x];
......
...@@ -37,7 +37,15 @@ ...@@ -37,7 +37,15 @@
#include "kCalculateGBVIAux.h" #include "kCalculateGBVIAux.h"
__global__ void METHOD_NAME(kCalculateGBVI, BornSum_kernel)(unsigned int* workUnit) __global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
#endif
void METHOD_NAME(kCalculateGBVI, BornSum_kernel)(unsigned int* workUnit)
{ {
extern __shared__ Atom sA[]; extern __shared__ Atom sA[];
......
...@@ -37,7 +37,15 @@ ...@@ -37,7 +37,15 @@
* different versions of the kernels. * different versions of the kernels.
*/ */
__global__ void METHOD_NAME(kCalculateGBVI, Forces2_kernel)(unsigned int* workUnit, unsigned int numWorkUnits) __global__ void
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_BORNFORCE2_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_BORNFORCE2_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_BORNFORCE2_THREADS_PER_BLOCK, 1)
#endif
METHOD_NAME(kCalculateGBVI, Forces2_kernel)(unsigned int* workUnit, unsigned int numWorkUnits)
{ {
extern __shared__ Atom sA[]; extern __shared__ Atom sA[];
unsigned int totalWarps = cSim.bornForce2_blocks*cSim.bornForce2_threads_per_block/GRID; unsigned int totalWarps = cSim.bornForce2_blocks*cSim.bornForce2_threads_per_block/GRID;
......
...@@ -123,7 +123,15 @@ void GetCalculateLocalForcesSim(gpuContext gpu) ...@@ -123,7 +123,15 @@ void GetCalculateLocalForcesSim(gpuContext gpu)
} }
__global__ void kCalculateLocalForces_kernel() __global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_LOCALFORCES_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_LOCALFORCES_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_LOCALFORCES_THREADS_PER_BLOCK, 1)
#endif
void kCalculateLocalForces_kernel()
{ {
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x; unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
Vectors* A = &sV[threadIdx.x]; Vectors* A = &sV[threadIdx.x];
......
...@@ -97,7 +97,9 @@ void GetCalculateObcGbsaBornSumSim(gpuContext gpu) ...@@ -97,7 +97,9 @@ void GetCalculateObcGbsaBornSumSim(gpuContext gpu)
#include "kCalculateObcGbsaBornSum.h" #include "kCalculateObcGbsaBornSum.h"
__global__ void kClearObcGbsaBornSum_kernel() __global__
__launch_bounds__(384, 1)
void kClearObcGbsaBornSum_kernel()
{ {
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x; unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
while (pos < cSim.stride * cSim.nonbondOutputBuffers) while (pos < cSim.stride * cSim.nonbondOutputBuffers)
...@@ -107,7 +109,9 @@ __global__ void kClearObcGbsaBornSum_kernel() ...@@ -107,7 +109,9 @@ __global__ void kClearObcGbsaBornSum_kernel()
} }
} }
__global__ void kReduceObcGbsaBornSum_kernel() __global__
__launch_bounds__(384, 1)
void kReduceObcGbsaBornSum_kernel()
{ {
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x); unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
......
...@@ -30,7 +30,15 @@ ...@@ -30,7 +30,15 @@
* different versions of the kernels. * different versions of the kernels.
*/ */
__global__ void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* workUnit) __global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
#endif
void METHOD_NAME(kCalculateObcGbsa, BornSum_kernel)(unsigned int* workUnit)
{ {
extern __shared__ Atom sA[]; extern __shared__ Atom sA[];
/* /*
......
...@@ -30,7 +30,15 @@ ...@@ -30,7 +30,15 @@
* different versions of the kernels. * different versions of the kernels.
*/ */
__global__ void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* workUnit) __global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_BORNFORCE2_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_BORNFORCE2_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_BORNFORCE2_THREADS_PER_BLOCK, 1)
#endif
void METHOD_NAME(kCalculateObcGbsa, Forces2_kernel)(unsigned int* workUnit)
{ {
extern __shared__ Atom sA[]; extern __shared__ Atom sA[];
unsigned int totalWarps = cSim.bornForce2_blocks*cSim.bornForce2_threads_per_block/GRID; unsigned int totalWarps = cSim.bornForce2_blocks*cSim.bornForce2_threads_per_block/GRID;
......
...@@ -53,7 +53,9 @@ void GetForcesSim(gpuContext gpu) ...@@ -53,7 +53,9 @@ void GetForcesSim(gpuContext gpu)
RTERROR(status, "cudaMemcpyFromSymbol: GetForcesSim copy from cSim failed"); RTERROR(status, "cudaMemcpyFromSymbol: GetForcesSim copy from cSim failed");
} }
__global__ void kClearForces_kernel() __global__
__launch_bounds__(384, 1)
void kClearForces_kernel()
{ {
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x; unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
while (pos < cSim.stride * cSim.outputBuffers) while (pos < cSim.stride * cSim.outputBuffers)
...@@ -70,7 +72,9 @@ void kClearForces(gpuContext gpu) ...@@ -70,7 +72,9 @@ void kClearForces(gpuContext gpu)
LAUNCHERROR("kClearForces"); LAUNCHERROR("kClearForces");
} }
__global__ void kClearBornForces_kernel() __global__
__launch_bounds__(384, 1)
void kClearBornForces_kernel()
{ {
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x; unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
while (pos < cSim.stride * cSim.nonbondOutputBuffers) while (pos < cSim.stride * cSim.nonbondOutputBuffers)
...@@ -87,7 +91,9 @@ void kClearBornForces(gpuContext gpu) ...@@ -87,7 +91,9 @@ void kClearBornForces(gpuContext gpu)
LAUNCHERROR("kClearBornForces"); LAUNCHERROR("kClearBornForces");
} }
__global__ void kClearEnergy_kernel() __global__
__launch_bounds__(384, 1)
void kClearEnergy_kernel()
{ {
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x; unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
while (pos < cSim.energyOutputBuffers) while (pos < cSim.energyOutputBuffers)
...@@ -104,7 +110,15 @@ void kClearEnergy(gpuContext gpu) ...@@ -104,7 +110,15 @@ void kClearEnergy(gpuContext gpu)
LAUNCHERROR("kClearEnergy"); LAUNCHERROR("kClearEnergy");
} }
__global__ void kReduceBornSumAndForces_kernel() __global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kReduceBornSumAndForces_kernel()
{ {
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x); unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
...@@ -207,7 +221,15 @@ void kReduceBornSumAndForces(gpuContext gpu) ...@@ -207,7 +221,15 @@ void kReduceBornSumAndForces(gpuContext gpu)
LAUNCHERROR("kReduceBornSumAndForces"); LAUNCHERROR("kReduceBornSumAndForces");
} }
__global__ void kReduceForces_kernel() __global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kReduceForces_kernel()
{ {
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x); unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
...@@ -269,7 +291,15 @@ double kReduceEnergy(gpuContext gpu) ...@@ -269,7 +291,15 @@ double kReduceEnergy(gpuContext gpu)
return sum; return sum;
} }
__global__ void kReduceObcGbsaBornForces_kernel() __global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
#endif
void kReduceObcGbsaBornForces_kernel()
{ {
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x); unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
float energy = 0.0f; float energy = 0.0f;
...@@ -327,7 +357,15 @@ __global__ void kReduceObcGbsaBornForces_kernel() ...@@ -327,7 +357,15 @@ __global__ void kReduceObcGbsaBornForces_kernel()
cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += energy / -6.0f; cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += energy / -6.0f;
} }
__global__ void kReduceGBVIBornForces_kernel() __global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
#endif
void kReduceGBVIBornForces_kernel()
{ {
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x); unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
float energy = 0.0f; float energy = 0.0f;
......
...@@ -101,7 +101,15 @@ void kLangevinUpdatePart2(gpuContext gpu) ...@@ -101,7 +101,15 @@ void kLangevinUpdatePart2(gpuContext gpu)
} }
__global__ void kSelectLangevinStepSize_kernel(float maxStepSize) __global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
#endif
void kSelectLangevinStepSize_kernel(float maxStepSize)
{ {
// Calculate the error. // Calculate the error.
......
...@@ -31,13 +31,21 @@ ...@@ -31,13 +31,21 @@
* different versions of the kernels. * different versions of the kernels.
*/ */
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
#endif
#ifdef REMOVE_CM #ifdef REMOVE_CM
__global__ void kLangevinUpdatePart1CM_kernel() void kLangevinUpdatePart1CM_kernel()
#else #else
__global__ void kLangevinUpdatePart1_kernel() void kLangevinUpdatePart1_kernel()
#endif #endif
{ {
__shared__ float params[MaxParams]; __shared__ volatile float params[MaxParams];
if (threadIdx.x < MaxParams) if (threadIdx.x < MaxParams)
params[threadIdx.x] = cSim.pLangevinParameters[threadIdx.x]; params[threadIdx.x] = cSim.pLangevinParameters[threadIdx.x];
__syncthreads(); __syncthreads();
...@@ -127,10 +135,18 @@ __global__ void kLangevinUpdatePart1_kernel() ...@@ -127,10 +135,18 @@ __global__ void kLangevinUpdatePart1_kernel()
} }
} }
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
#endif
#ifdef REMOVE_CM #ifdef REMOVE_CM
__global__ void kLangevinUpdatePart2CM_kernel() void kLangevinUpdatePart2CM_kernel()
#else #else
__global__ void kLangevinUpdatePart2_kernel() void kLangevinUpdatePart2_kernel()
#endif #endif
{ {
__shared__ float params[MaxParams]; __shared__ float params[MaxParams];
......
...@@ -54,7 +54,15 @@ void GetRandomSim(gpuContext gpu) ...@@ -54,7 +54,15 @@ void GetRandomSim(gpuContext gpu)
extern __shared__ float3 sRand[]; extern __shared__ float3 sRand[];
__global__ void kGenerateRandoms_kernel() __global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_RANDOM_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_RANDOM_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_RANDOM_THREADS_PER_BLOCK, 1)
#endif
void kGenerateRandoms_kernel()
{ {
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x; unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int increment = blockDim.x * gridDim.x; unsigned int increment = blockDim.x * gridDim.x;
......
...@@ -57,7 +57,15 @@ void GetSettleSim(gpuContext gpu) ...@@ -57,7 +57,15 @@ void GetSettleSim(gpuContext gpu)
* S. Miyamoto and P. Kollman, J. Comp. Chem., vol 13, no. 8, pp. 952-962 (1992). * S. Miyamoto and P. Kollman, J. Comp. Chem., vol 13, no. 8, pp. 952-962 (1992).
*/ */
__global__ void kApplyFirstSettle_kernel() __global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_SHAKE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_SHAKE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_SHAKE_THREADS_PER_BLOCK, 1)
#endif
void kApplyFirstSettle_kernel()
{ {
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x; unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
while (pos < cSim.settleConstraints) while (pos < cSim.settleConstraints)
...@@ -231,7 +239,15 @@ void kApplyFirstSettle(gpuContext gpu) ...@@ -231,7 +239,15 @@ void kApplyFirstSettle(gpuContext gpu)
} }
} }
__global__ void kApplySecondSettle_kernel() __global__ void
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_SHAKE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_SHAKE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_SHAKE_THREADS_PER_BLOCK, 1)
#endif
kApplySecondSettle_kernel()
{ {
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x; unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
while (pos < cSim.settleConstraints) while (pos < cSim.settleConstraints)
......
...@@ -64,9 +64,23 @@ void GetShakeHSim(gpuContext gpu) ...@@ -64,9 +64,23 @@ void GetShakeHSim(gpuContext gpu)
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed"); RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
} }
__global__ void kApplyFirstShake_kernel() __global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_SHAKE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_SHAKE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_SHAKE_THREADS_PER_BLOCK, 1)
#endif
void kApplyFirstShake_kernel()
{ {
__shared__ Atom sA[G8X_THREADS_PER_BLOCK]; #if (__CUDA_ARCH__ >= 200)
__shared__ Atom sA[GF1XX_SHAKE_THREADS_PER_BLOCK];
#elif (__CUDA_ARCH__ >= 130)
__shared__ Atom sA[GT2XX_SHAKE_THREADS_PER_BLOCK];
#else
__shared__ Atom sA[G8X_SHAKE_THREADS_PER_BLOCK];
#endif
Atom* psA = &sA[threadIdx.x]; Atom* psA = &sA[threadIdx.x];
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x; unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
while (pos < cSim.ShakeConstraints) while (pos < cSim.ShakeConstraints)
...@@ -226,9 +240,23 @@ void kApplyFirstShake(gpuContext gpu) ...@@ -226,9 +240,23 @@ void kApplyFirstShake(gpuContext gpu)
} }
} }
__global__ void kApplySecondShake_kernel() __global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_SHAKE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_SHAKE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_SHAKE_THREADS_PER_BLOCK, 1)
#endif
void kApplySecondShake_kernel()
{ {
__shared__ Atom sA[G8X_THREADS_PER_BLOCK]; #if (__CUDA_ARCH__ >= 200)
__shared__ Atom sA[GF1XX_SHAKE_THREADS_PER_BLOCK];
#elif (__CUDA_ARCH__ >= 130)
__shared__ Atom sA[GT2XX_SHAKE_THREADS_PER_BLOCK];
#else
__shared__ Atom sA[G8X_SHAKE_THREADS_PER_BLOCK];
#endif
Atom* psA = &sA[threadIdx.x]; Atom* psA = &sA[threadIdx.x];
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x; unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
while (pos < cSim.ShakeConstraints) while (pos < cSim.ShakeConstraints)
...@@ -394,7 +422,15 @@ __global__ void kApplySecondShake_kernel() ...@@ -394,7 +422,15 @@ __global__ void kApplySecondShake_kernel()
} }
} }
__global__ void kApplyNoShake_kernel() __global__ void
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_SHAKE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_SHAKE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_SHAKE_THREADS_PER_BLOCK, 1)
#endif
kApplyNoShake_kernel()
{ {
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x; unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
while (pos < cSim.NonShakeConstraints) while (pos < cSim.NonShakeConstraints)
......
...@@ -89,7 +89,15 @@ void kVerletUpdatePart2(gpuContext gpu) ...@@ -89,7 +89,15 @@ void kVerletUpdatePart2(gpuContext gpu)
} }
} }
__global__ void kSelectVerletStepSize_kernel(float maxStepSize) __global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
#endif
void kSelectVerletStepSize_kernel(float maxStepSize)
{ {
// Calculate the error. // Calculate the error.
......
...@@ -30,15 +30,23 @@ ...@@ -30,15 +30,23 @@
* different versions of the kernels. * different versions of the kernels.
*/ */
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
#endif
#ifdef REMOVE_CM #ifdef REMOVE_CM
__global__ void kVerletUpdatePart1CM_kernel() void kVerletUpdatePart1CM_kernel()
#else #else
__global__ void kVerletUpdatePart1_kernel() void kVerletUpdatePart1_kernel()
#endif #endif
{ {
// Load the step size to take. // Load the step size to take.
__shared__ float dtPos; __shared__ volatile float dtPos;
__shared__ float dtVel; __shared__ volatile float dtVel;
if (threadIdx.x == 0) if (threadIdx.x == 0)
{ {
float2 stepSize = cSim.pStepSize[0]; float2 stepSize = cSim.pStepSize[0];
...@@ -111,10 +119,18 @@ __global__ void kVerletUpdatePart1_kernel() ...@@ -111,10 +119,18 @@ __global__ void kVerletUpdatePart1_kernel()
} }
} }
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_UPDATE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_UPDATE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_UPDATE_THREADS_PER_BLOCK, 1)
#endif
#ifdef REMOVE_CM #ifdef REMOVE_CM
__global__ void kVerletUpdatePart2CM_kernel() void kVerletUpdatePart2CM_kernel()
#else #else
__global__ void kVerletUpdatePart2_kernel() void kVerletUpdatePart2_kernel()
#endif #endif
{ {
// Load the step size to take. // Load the step size to take.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment