Commit 68ec5df7 authored by Peter Eastman's avatar Peter Eastman
Browse files

Implemented a new (much faster) charge spreading algorithm for PME

parent 59be3426
......@@ -117,7 +117,7 @@ ELSE(CUDA_BUILD_TYPE MATCHES "Emulation")
ENDIF(CUDA_BUILD_TYPE MATCHES "Emulation")
SET(CUDA_BUILD_CUBIN TRUE CACHE BOOL "Generate and parse .cubin files in Device mode.")
SET(CUDA_NVCC_FLAGS "-maxrregcount=32;-use_fast_math;-O0" CACHE STRING "Semi-colon delimit multiple arguments.")
SET(CUDA_NVCC_FLAGS "-maxrregcount=32;-use_fast_math;-O0;-arch=sm_11" CACHE STRING "Semi-colon delimit multiple arguments.")
# Search for the cuda distribution.
IF(NOT CUDA_INSTALL_PREFIX)
......
/*
* Authored by: Chen, Shifu
*
* Email: chen@gmtk.org
*
* Website: http://www.gmtk.org/gsort
*
* The code is distributed under BSD license, you are allowed to use, modify or sell this code, but a statement is required if you used this code any where.
*
*/
#include <stdio.h>
#include <stdlib.h>
#include "vector_types.h"
#include "bbsort.h"
#include "bbsort_kernel.cu"
float getValue(float2 v){
return v.y;
}
template <typename T>
T getValue(T v){
return v;
}
# define CUDA_SAFE_CALL_NO_SYNC( call) { \
cudaError err = call; \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} }
# define CUDA_SAFE_CALL( call) CUDA_SAFE_CALL_NO_SYNC(call);
bool assignSliceToBuckets(unsigned int* sliceCount,int sliceSize,unsigned int* bucketOffset,unsigned int* bucketOfSlice,unsigned int* bucketSizes,unsigned int* sliceOffsetInBucket,int& bucketsCount,float step)
{
int i=0;
bool overflow=false;
int tmpSum=0;
bucketOffset[0]=0;
for(i=0;i<sliceSize; i++){
if(sliceCount[i] >BLOCK_SIZE)
{
overflow=true;
}
tmpSum += sliceCount[i];
bucketOfSlice[i]=bucketsCount;
bucketSizes[bucketsCount] = tmpSum;
sliceOffsetInBucket[i]=tmpSum -sliceCount[i];
if(tmpSum > BLOCK_SIZE )
{
if(i != 0)
{
bucketOfSlice[i]=bucketsCount+1;
bucketSizes[bucketsCount] -= sliceCount[i];
sliceOffsetInBucket[i]=0;
bucketOffset[bucketsCount+1]=bucketOffset[bucketsCount] + tmpSum - sliceCount[i];
bucketsCount++;
tmpSum=sliceCount[i];
bucketSizes[bucketsCount] = tmpSum;
}
else
{
bucketOffset[bucketsCount+1]=bucketOffset[bucketsCount] + tmpSum ;
sliceOffsetInBucket[i]=0;
tmpSum=0;
bucketsCount++;
}
}
}
bucketsCount++;
return overflow;
}
template <typename T>
void reduceMinMax(T* dData,int size,float& result,bool isMax)
{
int step;
step=(size%2==0)?
(size/2):(size/2 +1);
int blockSize=BLOCK_SIZE;
int blockCount;
int length=size;
T originalResult;
while(step > 0)
{
if(step%BLOCK_SIZE==0)
blockCount=step/BLOCK_SIZE;
else
blockCount=step/BLOCK_SIZE+1;
if(isMax)
reduceMaxD<<<blockCount,blockSize>>>(dData,step,length);
else
reduceMinD<<<blockCount,blockSize>>>(dData,step,length);
length=step;
step=(step%2==0 || step==1)?(step/2):(step/2 +1);
}
CUDA_SAFE_CALL(cudaMemcpy(&originalResult, dData, sizeof(T), cudaMemcpyDeviceToHost));
result=(float)getValue(originalResult);
}
template <typename T>
void evaluateDisorder(T* dData,int size,float maxValue, float minValue, int& listOrder)
{
int blockCount;
if((size-1) % BLOCK_SIZE ==0)blockCount=size/BLOCK_SIZE;
else blockCount=size/BLOCK_SIZE+1;
float* dDiffData;
CUDA_SAFE_CALL(cudaMalloc((void**)&dDiffData, sizeof(float) * size));
calDifferenceD<<<blockCount,BLOCK_SIZE,(BLOCK_SIZE)*sizeof(T)>>>(dData,dDiffData,size);
float sum=0;
int step;
step=(size%2==0)?
(size/2):(size/2 +1);
int blockSize=BLOCK_SIZE;
int length=size;
while(step > 0)
{
if(step%BLOCK_SIZE==0)
blockCount=step/BLOCK_SIZE;
else
blockCount=step/BLOCK_SIZE+1;
reduceSumD<<<blockCount,blockSize>>>(dDiffData,step,length);
length=step;
step=(step%2==0 || step==1)?(step/2):(step/2 +1);
}
CUDA_SAFE_CALL(cudaMemcpy(&sum, dDiffData, sizeof(float), cudaMemcpyDeviceToHost));
if( sum < (maxValue - minValue) * size / 10)
listOrder=NEARLY_SORTED;
else
listOrder=DISORDERLY;
CUDA_SAFE_CALL(cudaFree(dDiffData));
}
template <typename T>
void bbSortBody(T* dData,int size,int listOrder/*,float sliceStep,int sliceSize, T* dTmpData, float minValue,float maxValue*/)
{
float minValue,maxValue;
T* dTmpData;
CUDA_SAFE_CALL(cudaMalloc((void**)&dTmpData, sizeof(T) * size));
CUDA_SAFE_CALL(cudaMemcpy(dTmpData, dData, sizeof(T) * size, cudaMemcpyDeviceToDevice));
reduceMinMax(dTmpData,size,maxValue,true);
CUDA_SAFE_CALL(cudaMemcpy(dTmpData, dData, sizeof(T) * size, cudaMemcpyDeviceToDevice));
reduceMinMax(dTmpData,size,minValue,false);
if(minValue == maxValue)
{
CUDA_SAFE_CALL(cudaFree(dTmpData));
return ;
}
if(listOrder == AUTO_EVALUATE )
{
evaluateDisorder(dData,size,maxValue,minValue,listOrder);
}
float sliceStep =(50.0*((double)(maxValue-minValue)/(double)size));
int sliceSize = (maxValue-minValue)/sliceStep + 10;
int blockCount;
if(size%BLOCK_SIZE==0)blockCount=size/BLOCK_SIZE;
else blockCount=size/BLOCK_SIZE+1;
unsigned int* dSliceCounts;
unsigned int* dOffsetInSlice;
CUDA_SAFE_CALL(cudaMalloc((void**)&dOffsetInSlice, sizeof(unsigned int) * size));
CUDA_SAFE_CALL(cudaMalloc((void**)&dSliceCounts, sizeof(unsigned int) * sliceSize));
CUDA_SAFE_CALL(cudaMemset(dSliceCounts,0, sizeof(int) * sliceSize));
if(listOrder == NEARLY_SORTED)
{
assignElementToSlicesNearlySortedD<<<blockCount, BLOCK_SIZE>>>(dData,size,dSliceCounts,dOffsetInSlice,minValue,sliceStep,sliceSize,blockCount);
}
else
assignElementToSlicesD<<<blockCount, BLOCK_SIZE>>>(dData,size,dSliceCounts,dOffsetInSlice,minValue,sliceStep,sliceSize);
unsigned int* hSliceCounts=new unsigned int[sliceSize];
CUDA_SAFE_CALL(cudaMemcpy(hSliceCounts, dSliceCounts, sizeof(unsigned int) * sliceSize, cudaMemcpyDeviceToHost));
int looseBucketSize=size/100;
unsigned int* hBucketOffsets=new unsigned int[looseBucketSize];
unsigned int* hBucketSizes=new unsigned int[looseBucketSize];
unsigned int* hBucketOfSlices=new unsigned int[sliceSize];
unsigned int* hSliceOffsetInBucket=new unsigned int[sliceSize];
int bucketsCount=0;
memset(hBucketSizes,0,sizeof(int) * looseBucketSize);
memset(hSliceOffsetInBucket,0,sizeof(unsigned int) * sliceSize);
bool overflow;
overflow = assignSliceToBuckets(hSliceCounts,sliceSize,hBucketOffsets,hBucketOfSlices,hBucketSizes,hSliceOffsetInBucket,bucketsCount,sliceStep);
unsigned int* dBucketOffsets;
unsigned int* dBucketSizes;
unsigned int* dBucketOfSlices;
unsigned int* dSliceOffsetInBucket;
CUDA_SAFE_CALL(cudaMalloc((void**)&dBucketOfSlices, sizeof(unsigned int) * sliceSize));
CUDA_SAFE_CALL(cudaMalloc((void**)&dSliceOffsetInBucket, sizeof(unsigned int) * sliceSize));
CUDA_SAFE_CALL(cudaMalloc((void**)&dBucketOffsets, sizeof(unsigned int) * bucketsCount));
CUDA_SAFE_CALL(cudaMalloc((void**)&dBucketSizes, sizeof(unsigned int) * bucketsCount));
CUDA_SAFE_CALL(cudaMemcpy(dBucketOfSlices, hBucketOfSlices, sizeof(unsigned int) * sliceSize, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(dSliceOffsetInBucket, hSliceOffsetInBucket, sizeof(unsigned int) * sliceSize, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(dBucketOffsets, hBucketOffsets, sizeof(unsigned int) * bucketsCount, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(dBucketSizes, hBucketSizes, sizeof(unsigned int) * bucketsCount, cudaMemcpyHostToDevice));
cudaBindTexture(0,tBucketOffsets,dBucketOffsets);
cudaBindTexture(0,tBucketSizes,dBucketSizes);
cudaBindTexture(0,tBucketOfSlices,dBucketOfSlices);
cudaBindTexture(0,tSliceOffsetInBucket,dSliceOffsetInBucket);
assignElementToBucketD<<<blockCount, BLOCK_SIZE>>>(dData,dTmpData,size,dOffsetInSlice,minValue,sliceStep);
CUDA_SAFE_CALL( cudaThreadSynchronize() );
bitonicSortD<<<bucketsCount, BLOCK_SIZE, sizeof(T) * BLOCK_SIZE>>>(dTmpData);
CUDA_SAFE_CALL(cudaMemcpy(dData, dTmpData, sizeof(T) * size, cudaMemcpyDeviceToDevice));
if(overflow){
for(int i=0;i<bucketsCount;i++)
{
if(hBucketSizes[i] > BLOCK_SIZE)
{
bbSort(dData + hBucketOffsets[i],hBucketSizes[i],listOrder);
}
}
}
delete hBucketOffsets;
delete hBucketOfSlices;
delete hSliceCounts;
delete hBucketSizes;
CUDA_SAFE_CALL(cudaFree(dOffsetInSlice));
CUDA_SAFE_CALL(cudaFree(dSliceCounts));
CUDA_SAFE_CALL(cudaFree(dTmpData));
cudaUnbindTexture( tBucketSizes );
CUDA_SAFE_CALL(cudaFree(dBucketSizes));
cudaUnbindTexture( tBucketOffsets );
CUDA_SAFE_CALL(cudaFree(dBucketOffsets));
cudaUnbindTexture( tBucketOfSlices );
CUDA_SAFE_CALL(cudaFree(dBucketOfSlices));
cudaUnbindTexture( tSliceOffsetInBucket );
CUDA_SAFE_CALL(cudaFree(dSliceOffsetInBucket));
}
/************************************************************************************
Uncomment your desired function definition here
Please note that, only one type of bbsort() can be used in a program, due to NVCC compiler doesn't support overriding kernel function
float, double, int, uint, short, and ushort are originally supported, if you want to use bbsort() in double
please follow the readme.txt
Also note that you need to use 1.3 capbility (use arch=sm_13 in your compile command) to sort doubles
*************************************************************************************/
template<>
void bbSort(float2* dData,int size,int listOrder)
{
bbSortBody(dData,size,listOrder);
}
//void bbSort(float* dData,int size,int listOrder)
//{
//
// bbSortBody(dData,size,listOrder);
//}
//void bbSort(int* dData,int size,int listOrder)
//{
//
// bbSortBody(dData,size,listOrder);
//}
//
//void bbSort(unsigned int* dData,int size,int listOrder)
//{
//
// bbSortBody(dData,size,listOrder);
//}
//
//void bbSort(double* dData,int size,int listOrder)
//{
//
// bbSortBody(dData,size,listOrder);
//}
/*
* Authored by: Chen, Shifu
*
* Email: chen@gmtk.org
*
* Website: http://www.gmtk.org/gsort
*
* The code is distributed under BSD license, you are allowed to use, modify or sell this code, but a statement is required if you used this code any where.
*
*/
#ifndef _BBSORT_H_
#define _BBSORT_H_
#define BLOCK_SIZE 512
#define DISORDERLY 0
#define NEARLY_SORTED 1
#define AUTO_EVALUATE 2
template <typename T>
void bbSort(T* dData,int number,int listOrder=AUTO_EVALUATE);
#endif // _BBSORT_H_
/*
* Authored by: Chen, Shifu
*
* Email: chen@gmtk.org
*
* Website: http://www.gmtk.org/gsort
*
* The code is distributed under BSD license, you are allowed to use, modify or sell this code, but a statement is required if you used this code any where.
*
*/
#ifndef _BBSORT_KERNEL_H_
#define _BBSORT_KERNEL_H_
#include "bbsort.h"
#include "math_constants.h"
texture<unsigned int, 1, cudaReadModeElementType> tBucketSizes;
texture<unsigned int, 1, cudaReadModeElementType> tBucketOffsets;
texture<unsigned int, 1, cudaReadModeElementType> tBucketOfSlices;
texture<unsigned int, 1, cudaReadModeElementType> tSliceOffsetInBucket;
__device__ float dGetValue(float2 v){
return v.y;
}
template <typename T>
__device__ T dGetValue(T v){
return v;
}
__device__ void dPad(float2& v){
v.x=0x3fffffff;
v.y=0x4fffffff;
}
template <typename T>
__device__ void dPad(T & v){
v=0x7fffffff;
}
template <typename T>
__global__ static void reduceMaxD(T * dData,int step,int length)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
if(index + step >=length)
return ;
dData[index] = dGetValue(dData[index])>dGetValue(dData[index+step])?dData[index]:dData[index+step];
}
template <typename T>
__global__ static void reduceMinD(T * dData,int step,int length)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
if(index + step >=length)
return ;
dData[index] = dGetValue(dData[index])<dGetValue(dData[index+step])?dData[index]:dData[index+step];
}
__global__ static void reduceSumD(float * dDiffData,int step,int length)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
if(index + step >=length)
return ;
dDiffData[index] += dDiffData[index+step];
}
template <typename T>
__global__ static void calDifferenceD(T * dData,float * dDiffData,int size)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
if(index > size-1)
return ;
const unsigned int tid = threadIdx.x;
extern __shared__ T sData[];
sData[tid]=dData[index];
__syncthreads();
if(tid < blockDim.x -1)
dDiffData[index] = abs(dGetValue(sData[tid+1]) - dGetValue(sData[tid]));
else
dDiffData[index] =0;
}
template <typename T>
__device__ inline void dSwap(T & a, T & b)
{
T tmp = a;
a = b;
b = tmp;
}
template <typename T>
__global__ static void bitonicSortD(T * datas)
{
extern __shared__ T shared[];
const unsigned int bid=blockIdx.x;
const unsigned int tid = threadIdx.x;
__shared__ unsigned int count;
__shared__ unsigned int offset;
if(tid == 0)
{
count=tex1Dfetch(tBucketSizes,bid);
offset=tex1Dfetch(tBucketOffsets,bid);
}
__syncthreads();
if(tid < count)
shared[tid] = datas[tid+offset];
else
{
dPad(shared[tid]);
}
__syncthreads();
for (unsigned int k = 2; k <= BLOCK_SIZE; k *= 2)
{
for (unsigned int j = k / 2; j>0; j /= 2)
{
unsigned int ixj = tid ^ j;
if (ixj > tid)
{
if ((tid & k) == 0)
{
if (dGetValue(shared[tid]) > dGetValue(shared[ixj]))
{
dSwap(shared[tid], shared[ixj]);
}
}
else
{
if (dGetValue(shared[tid]) < dGetValue(shared[ixj]))
{
dSwap(shared[tid], shared[ixj]);
}
}
}
__syncthreads();
}
}
if(tid < count)
datas[tid+offset] = shared[tid];
}
template <typename T>
__global__ void assignElementToSlicesD(T* dDatas,int number,unsigned int* dSliceCounts,unsigned int* dOffsetInSlice,float minValue,float step,int sliceSize)
{
unsigned int index= __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if(index > number-1)
return ;
unsigned int s=((dGetValue(dDatas[index]) - minValue)/ step);
unsigned int offset=atomicInc(dSliceCounts + s,0xFFFFFFF);
dOffsetInSlice[index] = offset;
}
template <typename T>
__global__ void assignElementToSlicesNearlySortedD(T* dDatas,int number,unsigned int* dSliceCounts,unsigned int* dOffsetInSlice,float minValue,float step,int sliceSize,int blockCount)
{
unsigned int index= blockIdx.x + blockCount * threadIdx.x;
if(index > number-1)
return ;
unsigned int s=((dGetValue(dDatas[index]) - minValue)/ step);
unsigned int offset=atomicInc(dSliceCounts + s,0xFFFFFFF);
dOffsetInSlice[index] = offset;
}
template <typename T>
__global__ void assignElementToBucketD(T* dDatas,T* dNewDatas,int number,unsigned int* dOffsetInSlice,float minValue,float step)
{
unsigned int index= __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if(index > number-1)
return ;
unsigned int s=((dGetValue(dDatas[index]) - minValue)/ step);
unsigned int b=tex1Dfetch(tBucketOfSlices,s);
unsigned int offset =tex1Dfetch(tBucketOffsets,b) + tex1Dfetch(tSliceOffsetInBucket,s) + dOffsetInSlice[index];
dNewDatas[offset] =dDatas[index];
}
#endif // _BBSORT_KERNEL_H_
......@@ -367,6 +367,8 @@ struct cudaGmxSimulation {
int4* pPmeParticleIndex; // The grid indices for each atom
float4* pPmeParticleFraction; // Fractional offset in the grid for each atom in all three dimensions.
int* pPmeInteractionFlags; // Flags for which groups of grid points interact with which atoms
int* pPmeAtomRange; // The range of sorted atoms at each grid point
float2* pPmeAtomGridIndex; // The grid point each atom is at
unsigned int bonds; // Number of bonds
int4* pBondID; // Bond atom and output buffer IDs
float2* pBondParameter; // Bond parameters
......
......@@ -786,6 +786,10 @@ void gpuSetPMEParameters(gpuContext gpu, float alpha)
gpu->sim.pPmeParticleFraction = gpu->psPmeParticleFraction->_pDevData;
gpu->psPmeInteractionFlags = new CUDAStream<int>(totalGroups*(gpu->sim.paddedNumberOfAtoms/32), 1, "PmeInteractionFlags");
gpu->sim.pPmeInteractionFlags = gpu->psPmeInteractionFlags->_pDevData;
gpu->psPmeAtomRange = new CUDAStream<int>(gridSize.x*gridSize.y*gridSize.z+1, 1, "PmeAtomRange");
gpu->sim.pPmeAtomRange = gpu->psPmeAtomRange->_pDevData;
gpu->psPmeAtomGridIndex = new CUDAStream<float2>(gpu->natoms, 1, "PmeAtomGridIndex");
gpu->sim.pPmeAtomGridIndex = gpu->psPmeAtomGridIndex->_pDevData;
// Initialize the b-spline moduli.
......@@ -1686,6 +1690,8 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync)
gpu->psPmeBsplineDtheta = NULL;
gpu->psPmeParticleIndex = NULL;
gpu->psPmeParticleFraction = NULL;
gpu->psPmeAtomRange = NULL;
gpu->psPmeAtomGridIndex = NULL;
gpu->psShakeID = NULL;
gpu->psShakeParameter = NULL;
gpu->psSettleID = NULL;
......@@ -1853,6 +1859,8 @@ void gpuShutDown(gpuContext gpu)
delete gpu->psPmeBsplineDtheta;
delete gpu->psPmeParticleIndex;
delete gpu->psPmeParticleFraction;
delete gpu->psPmeAtomRange;
delete gpu->psPmeAtomGridIndex;
cufftDestroy(gpu->fftplan);
}
delete gpu->psObcData;
......
......@@ -113,6 +113,8 @@ struct _gpuContext {
CUDAStream<int4>* psPmeParticleIndex; // The grid indices for each atom
CUDAStream<float4>* psPmeParticleFraction; // Fractional offset in the grid for each atom in all three dimensions.
CUDAStream<int>* psPmeInteractionFlags; // Flags for which groups of grid points interact with which atoms
CUDAStream<int>* psPmeAtomRange; // The range of sorted atoms at each grid point
CUDAStream<float2>* psPmeAtomGridIndex; // The grid point each atom is at
CUDAStream<float2>* psObcData;
CUDAStream<float>* psObcChain;
CUDAStream<float>* psBornForce;
......
......@@ -25,6 +25,7 @@
* -------------------------------------------------------------------------- */
#include "gputypes.h"
#include "bbsort.h"
#include <cuda.h>
using namespace std;
......@@ -106,48 +107,86 @@ __global__ void kUpdateGridIndexAndFraction_kernel()
fast_mod(__float2int_rd(tix.y), cSim.pmeGridSize.y),
fast_mod(__float2int_rd(tix.z), cSim.pmeGridSize.z), 0);
cSim.pPmeParticleIndex[i] = itmp;
cSim.pPmeAtomGridIndex[i] = make_float2(i, itmp.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+itmp.y*cSim.pmeGridSize.z+itmp.z);
}
//
// // Compute flags for which atoms affect which groups of grid points.
//
// const int3 numGroups = make_int3((cSim.pmeGridSize.x+cSim.pmeGroupSize.x-1)/cSim.pmeGroupSize.x, (cSim.pmeGridSize.y+cSim.pmeGroupSize.y-1)/cSim.pmeGroupSize.y, (cSim.pmeGridSize.z+cSim.pmeGroupSize.z-1)/cSim.pmeGroupSize.z);
// const unsigned int totalGroups = numGroups.x*numGroups.y*numGroups.z;
// const float3 gridScale = make_float3(cSim.pmeGridSize.x/cSim.periodicBoxSizeX, cSim.pmeGridSize.y/cSim.periodicBoxSizeY, cSim.pmeGridSize.z/cSim.periodicBoxSizeZ);
// for (int group = tid; group < totalGroups; group += tnb)
// {
// int3 gridBase;
// gridBase.x = group/(numGroups.y*numGroups.z);
// int remainder = group-gridBase.x*numGroups.y*numGroups.z;
// gridBase.y = remainder/numGroups.z;
// gridBase.z = remainder-gridBase.y*numGroups.z;
// gridBase.x *= cSim.pmeGroupSize.x;
// gridBase.y *= cSim.pmeGroupSize.y;
// gridBase.z *= cSim.pmeGroupSize.z;
// unsigned int flags = 0;
// unsigned int baseIndex = group*(cSim.paddedNumberOfAtoms/32);
// for (int atomBlock = 0; atomBlock < cSim.paddedNumberOfAtoms>>GRIDBITS; atomBlock++)
// {
// // Decide if this block actually needs to be processed.
//
// int flagIndex = atomBlock%32;
// if (flagIndex == 0)
// flags = 0;
// float4 boxSize = cSim.pGridBoundingBox[atomBlock];
// float4 center = cSim.pGridCenter[atomBlock];
// int maxx = (int) ceil((center.x+boxSize.x)*gridScale.x)+cSim.pmeGroupSize.x+PME_ORDER;
// int maxy = (int) ceil((center.y+boxSize.y)*gridScale.y)+cSim.pmeGroupSize.y+PME_ORDER;
// int maxz = (int) ceil((center.z+boxSize.z)*gridScale.z)+cSim.pmeGroupSize.z+PME_ORDER;
// int minx = (int) floor((center.x-boxSize.x)*gridScale.x);
// int miny = (int) floor((center.y-boxSize.y)*gridScale.y);
// int minz = (int) floor((center.z-boxSize.z)*gridScale.z);
// int x = minx+(gridBase.x-minx)%cSim.pmeGridSize.x;
// int y = miny+(gridBase.y-miny)%cSim.pmeGridSize.y;
// int z = minz+(gridBase.z-minz)%cSim.pmeGridSize.z;
// if (maxx < x || maxy < y || maxz < z)
// flags += 1<<flagIndex;
// if (flagIndex == 31 || atomBlock == cSim.paddedNumberOfAtoms>>GRIDBITS)
// cSim.pPmeInteractionFlags[baseIndex+atomBlock/32] = flags;
// }
// }
}
// Compute flags for which atoms affect which groups of grid points.
/**
* For each grid point, find the range of sorted atoms associated with that point.
*/
const int3 numGroups = make_int3((cSim.pmeGridSize.x+cSim.pmeGroupSize.x-1)/cSim.pmeGroupSize.x, (cSim.pmeGridSize.y+cSim.pmeGroupSize.y-1)/cSim.pmeGroupSize.y, (cSim.pmeGridSize.z+cSim.pmeGroupSize.z-1)/cSim.pmeGroupSize.z);
const unsigned int totalGroups = numGroups.x*numGroups.y*numGroups.z;
const float3 gridScale = make_float3(cSim.pmeGridSize.x/cSim.periodicBoxSizeX, cSim.pmeGridSize.y/cSim.periodicBoxSizeY, cSim.pmeGridSize.z/cSim.periodicBoxSizeZ);
for (int group = tid; group < totalGroups; group += tnb)
__global__ void kFindAtomRangeForGrid_kernel()
{
int thread = blockIdx.x*blockDim.x+threadIdx.x;
int start = (cSim.atoms*thread)/(blockDim.x*gridDim.x);
int end = (cSim.atoms*(thread+1))/(blockDim.x*gridDim.x);
int last = (thread == 0 ? -1 : cSim.pPmeAtomGridIndex[start-1].y);
for (int i = start; i < end; ++i)
{
int3 gridBase;
gridBase.x = group/(numGroups.y*numGroups.z);
int remainder = group-gridBase.x*numGroups.y*numGroups.z;
gridBase.y = remainder/numGroups.z;
gridBase.z = remainder-gridBase.y*numGroups.z;
gridBase.x *= cSim.pmeGroupSize.x;
gridBase.y *= cSim.pmeGroupSize.y;
gridBase.z *= cSim.pmeGroupSize.z;
unsigned int flags = 0;
unsigned int baseIndex = group*(cSim.paddedNumberOfAtoms/32);
for (int atomBlock = 0; atomBlock < cSim.paddedNumberOfAtoms>>GRIDBITS; atomBlock++)
float2 atomData = cSim.pPmeAtomGridIndex[i];
int gridIndex = atomData.y;
if (gridIndex != last)
{
// Decide if this block actually needs to be processed.
int flagIndex = atomBlock%32;
if (flagIndex == 0)
flags = 0;
float4 boxSize = cSim.pGridBoundingBox[atomBlock];
float4 center = cSim.pGridCenter[atomBlock];
int maxx = (int) ceil((center.x+boxSize.x)*gridScale.x)+cSim.pmeGroupSize.x+PME_ORDER;
int maxy = (int) ceil((center.y+boxSize.y)*gridScale.y)+cSim.pmeGroupSize.y+PME_ORDER;
int maxz = (int) ceil((center.z+boxSize.z)*gridScale.z)+cSim.pmeGroupSize.z+PME_ORDER;
int minx = (int) floor((center.x-boxSize.x)*gridScale.x);
int miny = (int) floor((center.y-boxSize.y)*gridScale.y);
int minz = (int) floor((center.z-boxSize.z)*gridScale.z);
int x = minx+(gridBase.x-minx)%cSim.pmeGridSize.x;
int y = miny+(gridBase.y-miny)%cSim.pmeGridSize.y;
int z = minz+(gridBase.z-minz)%cSim.pmeGridSize.z;
if (maxx < x || maxy < y || maxz < z)
flags += 1<<flagIndex;
if (flagIndex == 31 || atomBlock == cSim.paddedNumberOfAtoms>>GRIDBITS)
cSim.pPmeInteractionFlags[baseIndex+atomBlock/32] = flags;
for (int j = last+1; j <= gridIndex; ++j)
cSim.pPmeAtomRange[j] = i;
last = gridIndex;
}
// The grid index won't be needed again. Reuse that component to hold the atom charge, thus saving
// an extra load operation in the charge spreading kernel.
cSim.pPmeAtomGridIndex[i].y = cSim.pPosq[(int) atomData.x].w;
}
// Fill in values beyond the last atom.
if (thread == blockDim.x*gridDim.x-1)
{
int gridSize = cSim.pmeGridSize.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z;
for (int j = last+1; j <= gridSize; ++j)
cSim.pPmeAtomRange[j] = cSim.atoms;
}
}
......@@ -217,79 +256,116 @@ __global__ void kUpdateBsplines_kernel()
}
}
//__global__ void kGridSpreadCharge_kernel()
//{
// extern __shared__ float atomCharge[];
// int4* atomGridIndex = (int4*) &atomCharge[blockDim.x];
// const unsigned int totalWarps = gridDim.x*blockDim.x/GRID;
// const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
// const int3 numGroups = make_int3((cSim.pmeGridSize.x+cSim.pmeGroupSize.x-1)/cSim.pmeGroupSize.x, (cSim.pmeGridSize.y+cSim.pmeGroupSize.y-1)/cSim.pmeGroupSize.y, (cSim.pmeGridSize.z+cSim.pmeGroupSize.z-1)/cSim.pmeGroupSize.z);
// const unsigned int totalGroups = numGroups.x*numGroups.y*numGroups.z;
// unsigned int group = warp*totalGroups/totalWarps;
// const unsigned int end = (warp+1)*totalGroups/totalWarps;
// const unsigned int index = threadIdx.x & (GRID - 1);
//
// while (group < end)
// {
// // Process a group of grid points of size cSim.pmeGroupSize. First figure out the base index for the group,
// // and the index of the specific point this thread will handle.
//
// int3 gridBase;
// gridBase.x = group/(numGroups.y*numGroups.z);
// int remainder = group-gridBase.x*numGroups.y*numGroups.z;
// gridBase.y = remainder/numGroups.z;
// gridBase.z = remainder-gridBase.y*numGroups.z;
// gridBase.x *= cSim.pmeGroupSize.x;
// gridBase.y *= cSim.pmeGroupSize.y;
// gridBase.z *= cSim.pmeGroupSize.z;
// int3 gridPoint;
// gridPoint.x = index/(cSim.pmeGroupSize.y*cSim.pmeGroupSize.z);
// remainder = index-gridPoint.x*cSim.pmeGroupSize.y*cSim.pmeGroupSize.z;
// gridPoint.y = remainder/cSim.pmeGroupSize.z;
// gridPoint.z = remainder-gridPoint.y*cSim.pmeGroupSize.z;
// gridPoint.x += gridBase.x;
// gridPoint.y += gridBase.y;
// gridPoint.z += gridBase.z;
//
// // Loop over blocks of atoms.
//
// float result = 0.0f;
// int flags = 0;
// unsigned int baseIndex = group*(cSim.paddedNumberOfAtoms/32);
// for (int atomBlock = 0; atomBlock < cSim.paddedNumberOfAtoms>>GRIDBITS; atomBlock++)
// {
// // Decide if this block actually needs to be processed.
//
// int flagIndex = atomBlock%32;
// if (flagIndex == 0)
// flags = cSim.pPmeInteractionFlags[baseIndex+atomBlock/32];
// if ((flags & (1<<flagIndex)) != 0)
// continue;
// int atomIndex = (atomBlock<<GRIDBITS)+index;
// if (atomIndex < cSim.atoms)
// {
// atomCharge[threadIdx.x] = cSim.pPosq[atomIndex].w;
// atomGridIndex[threadIdx.x] = cSim.pPmeParticleIndex[atomIndex];
// }
// int maxAtoms = min(GRID, cSim.atoms-(atomBlock<<GRIDBITS));
// for (int i = 0; i < maxAtoms; i++)
// {
// int localIndex = threadIdx.x-index+i;
// int atomIndex = (atomBlock<<GRIDBITS)+i;
// int ix = gridPoint.x-atomGridIndex[localIndex].x;
// int iy = gridPoint.y-atomGridIndex[localIndex].y;
// int iz = gridPoint.z-atomGridIndex[localIndex].z;
// ix += (ix < 0 ? cSim.pmeGridSize.x : 0);
// iy += (iy < 0 ? cSim.pmeGridSize.y : 0);
// iz += (iz < 0 ? cSim.pmeGridSize.z : 0);
// if (ix < PME_ORDER && iy < PME_ORDER && iz < PME_ORDER)
// result += atomCharge[threadIdx.x-index+i]*cSim.pPmeBsplineTheta[atomIndex+ix*cSim.atoms].x*cSim.pPmeBsplineTheta[atomIndex+iy*cSim.atoms].y*cSim.pPmeBsplineTheta[atomIndex+iz*cSim.atoms].z;
// }
// }
// unsigned int gridIndex = gridPoint.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+gridPoint.y*cSim.pmeGridSize.z+gridPoint.z;
// if (gridIndex < cSim.pmeGridSize.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z)
// cSim.pPmeGrid[gridIndex] = make_cuComplex(result*sqrt(cSim.epsfac), 0.0f);
// group++;
// }
//}
__global__ void kGridSpreadCharge_kernel()
{
extern __shared__ float atomCharge[];
int4* atomGridIndex = (int4*) &atomCharge[blockDim.x];
const unsigned int totalWarps = gridDim.x*blockDim.x/GRID;
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
const int3 numGroups = make_int3((cSim.pmeGridSize.x+cSim.pmeGroupSize.x-1)/cSim.pmeGroupSize.x, (cSim.pmeGridSize.y+cSim.pmeGroupSize.y-1)/cSim.pmeGroupSize.y, (cSim.pmeGridSize.z+cSim.pmeGroupSize.z-1)/cSim.pmeGroupSize.z);
const unsigned int totalGroups = numGroups.x*numGroups.y*numGroups.z;
unsigned int group = warp*totalGroups/totalWarps;
const unsigned int end = (warp+1)*totalGroups/totalWarps;
const unsigned int index = threadIdx.x & (GRID - 1);
while (group < end)
unsigned int numGridPoints = cSim.pmeGridSize.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z;
unsigned int numThreads = gridDim.x*blockDim.x;
for (int gridIndex = blockIdx.x*blockDim.x+threadIdx.x; gridIndex < numGridPoints; gridIndex += numThreads)
{
// Process a group of grid points of size cSim.pmeGroupSize. First figure out the base index for the group,
// and the index of the specific point this thread will handle.
int3 gridBase;
gridBase.x = group/(numGroups.y*numGroups.z);
int remainder = group-gridBase.x*numGroups.y*numGroups.z;
gridBase.y = remainder/numGroups.z;
gridBase.z = remainder-gridBase.y*numGroups.z;
gridBase.x *= cSim.pmeGroupSize.x;
gridBase.y *= cSim.pmeGroupSize.y;
gridBase.z *= cSim.pmeGroupSize.z;
int3 gridPoint;
gridPoint.x = index/(cSim.pmeGroupSize.y*cSim.pmeGroupSize.z);
remainder = index-gridPoint.x*cSim.pmeGroupSize.y*cSim.pmeGroupSize.z;
gridPoint.y = remainder/cSim.pmeGroupSize.z;
gridPoint.z = remainder-gridPoint.y*cSim.pmeGroupSize.z;
gridPoint.x += gridBase.x;
gridPoint.y += gridBase.y;
gridPoint.z += gridBase.z;
// Loop over blocks of atoms.
gridPoint.x = gridIndex/(cSim.pmeGridSize.y*cSim.pmeGridSize.z);
int remainder = gridIndex-gridPoint.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z;
gridPoint.y = remainder/cSim.pmeGridSize.z;
gridPoint.z = remainder-gridPoint.y*cSim.pmeGridSize.z;
gridPoint.x += cSim.pmeGridSize.x;
gridPoint.y += cSim.pmeGridSize.y;
gridPoint.z += cSim.pmeGridSize.z;
float result = 0.0f;
int flags = 0;
unsigned int baseIndex = group*(cSim.paddedNumberOfAtoms/32);
for (int atomBlock = 0; atomBlock < cSim.paddedNumberOfAtoms>>GRIDBITS; atomBlock++)
for (int ix = 0; ix < PME_ORDER; ++ix)
for (int iy = 0; iy < PME_ORDER; ++iy)
for (int iz = 0; iz < PME_ORDER; ++iz)
{
// Decide if this block actually needs to be processed.
int flagIndex = atomBlock%32;
if (flagIndex == 0)
flags = cSim.pPmeInteractionFlags[baseIndex+atomBlock/32];
if ((flags & (1<<flagIndex)) != 0)
continue;
int atomIndex = (atomBlock<<GRIDBITS)+index;
if (atomIndex < cSim.atoms)
{
atomCharge[threadIdx.x] = cSim.pPosq[atomIndex].w;
atomGridIndex[threadIdx.x] = cSim.pPmeParticleIndex[atomIndex];
}
int maxAtoms = min(GRID, cSim.atoms-(atomBlock<<GRIDBITS));
for (int i = 0; i < maxAtoms; i++)
int x = (gridPoint.x-ix)%cSim.pmeGridSize.x;
int y = (gridPoint.y-iy)%cSim.pmeGridSize.y;
int z = (gridPoint.z-iz)%cSim.pmeGridSize.z;
int gridIndex = x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+y*cSim.pmeGridSize.z+z;
int firstAtom = cSim.pPmeAtomRange[gridIndex];
int lastAtom = cSim.pPmeAtomRange[gridIndex+1];
for (int i = firstAtom; i < lastAtom; ++i)
{
int localIndex = threadIdx.x-index+i;
int atomIndex = (atomBlock<<GRIDBITS)+i;
int ix = gridPoint.x-atomGridIndex[localIndex].x;
int iy = gridPoint.y-atomGridIndex[localIndex].y;
int iz = gridPoint.z-atomGridIndex[localIndex].z;
ix += (ix < 0 ? cSim.pmeGridSize.x : 0);
iy += (iy < 0 ? cSim.pmeGridSize.y : 0);
iz += (iz < 0 ? cSim.pmeGridSize.z : 0);
if (ix < PME_ORDER && iy < PME_ORDER && iz < PME_ORDER)
result += atomCharge[threadIdx.x-index+i]*cSim.pPmeBsplineTheta[atomIndex+ix*cSim.atoms].x*cSim.pPmeBsplineTheta[atomIndex+iy*cSim.atoms].y*cSim.pPmeBsplineTheta[atomIndex+iz*cSim.atoms].z;
float2 atomData = cSim.pPmeAtomGridIndex[i];
int atomIndex = atomData.x;
float atomCharge = atomData.y;
result += atomCharge*cSim.pPmeBsplineTheta[atomIndex+ix*cSim.atoms].x*cSim.pPmeBsplineTheta[atomIndex+iy*cSim.atoms].y*cSim.pPmeBsplineTheta[atomIndex+iz*cSim.atoms].z;
}
}
unsigned int gridIndex = gridPoint.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+gridPoint.y*cSim.pmeGridSize.z+gridPoint.z;
if (gridIndex < cSim.pmeGridSize.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z)
cSim.pPmeGrid[gridIndex] = make_cuComplex(result*sqrt(cSim.epsfac), 0.0f);
group++;
}
}
......@@ -370,11 +446,17 @@ void kCalculatePME(gpuContext gpu)
// printf("kCalculatePME\n");
kUpdateGridIndexAndFraction_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kUpdateGridIndexAndFraction");
bbSort(gpu->psPmeAtomGridIndex->_pDevData, gpu->natoms);
kFindAtomRangeForGrid_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kFindAtomRangeForGrid");
unsigned int threads = 16380/(2*PME_ORDER*sizeof(float4));
kUpdateBsplines_kernel<<<gpu->sim.blocks, threads, 2*threads*PME_ORDER*sizeof(float4)>>>();
LAUNCHERROR("kUpdateBsplines");
kGridSpreadCharge_kernel<<<gpu->sim.blocks, 64, 64*(sizeof(float)+sizeof(int4))>>>();
LAUNCHERROR("kGridSpreadCharge");
// gpu->psPmeGrid->Download();
// for (int i = 0; i < gpu->psPmeGrid->_length; i += 100)
// printf("%d %f %f\n", i, (*gpu->psPmeGrid)[i].x, (*gpu->psPmeGrid)[i].y);
cufftExecC2C(gpu->fftplan, gpu->psPmeGrid->_pDevData, gpu->psPmeGrid->_pDevData, CUFFT_FORWARD);
kReciprocalConvolution_kernel<<<gpu->sim.blocks, gpu->sim.nonbond_threads_per_block>>>();
LAUNCHERROR("kReciprocalConvolution");
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment