Implemented a new (much faster) charge spreading algorithm for PME

68ec5df7 · Peter Eastman · 59be3426 · 68ec5df7 · 68ec5df7 · 68ec5df7
Commit 68ec5df7 authored Sep 03, 2009 by Peter Eastman
8 changed files
--- a/platforms/cuda/cuda-cmake/FindCuda.cmake
+++ b/platforms/cuda/cuda-cmake/FindCuda.cmake
@@ -117,7 +117,7 @@ ELSE(CUDA_BUILD_TYPE MATCHES "Emulation")
 ENDIF(CUDA_BUILD_TYPE MATCHES "Emulation")

 SET(CUDA_BUILD_CUBIN TRUE CACHE BOOL "Generate and parse .cubin files in Device mode.")
-SET(CUDA_NVCC_FLAGS "-maxrregcount=32;-use_fast_math;-O0" CACHE STRING "Semi-colon delimit multiple arguments.")
+SET(CUDA_NVCC_FLAGS "-maxrregcount=32;-use_fast_math;-O0;-arch=sm_11" CACHE STRING "Semi-colon delimit multiple arguments.")

 # Search for the cuda distribution.
 IF(NOT CUDA_INSTALL_PREFIX)

--- a/platforms/cuda/src/kernels/bbsort.cu
+++ b/platforms/cuda/src/kernels/bbsort.cu
+/*
+ * Authored by: Chen, Shifu
+ * 
+ * Email: chen@gmtk.org
+ *
+ * Website: http://www.gmtk.org/gsort
+ *
+ * The code is distributed under BSD license, you are allowed to use, modify or sell this code, but a statement is required if you used this code any where.
+ * 
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "vector_types.h"
+#include "bbsort.h"
+#include "bbsort_kernel.cu"
+
+
+float getValue(float2 v){
+	return v.y;
+}
+
+template <typename T>
+T getValue(T v){
+	return v;
+}
+
+#  define CUDA_SAFE_CALL_NO_SYNC( call) {                                    \
+    cudaError err = call;                                                    \
+    if( cudaSuccess != err) {                                                \
+        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",        \
+                __FILE__, __LINE__, cudaGetErrorString( err) );              \
+        exit(EXIT_FAILURE);                                                  \
+    } }
+
+#  define CUDA_SAFE_CALL( call)     CUDA_SAFE_CALL_NO_SYNC(call);
+
+bool assignSliceToBuckets(unsigned int* sliceCount,int sliceSize,unsigned int* bucketOffset,unsigned int* bucketOfSlice,unsigned int* bucketSizes,unsigned int* sliceOffsetInBucket,int& bucketsCount,float step)
+{
+	int i=0;
+
+	bool overflow=false;
+
+	int tmpSum=0;
+
+	bucketOffset[0]=0;
+
+	for(i=0;i<sliceSize; i++){
+		if(sliceCount[i] >BLOCK_SIZE)
+		{
+			overflow=true;
+		}
+
+		tmpSum += sliceCount[i];
+		bucketOfSlice[i]=bucketsCount;
+		bucketSizes[bucketsCount] = tmpSum;
+		sliceOffsetInBucket[i]=tmpSum -sliceCount[i];
+		if(tmpSum > BLOCK_SIZE )
+		{	
+			if(i != 0)
+			{
+				bucketOfSlice[i]=bucketsCount+1;
+				bucketSizes[bucketsCount] -= sliceCount[i];
+				sliceOffsetInBucket[i]=0;
+				bucketOffset[bucketsCount+1]=bucketOffset[bucketsCount] + tmpSum -  sliceCount[i];
+
+				bucketsCount++;
+				tmpSum=sliceCount[i];
+				bucketSizes[bucketsCount] = tmpSum;
+			}
+			else 
+			{
+				bucketOffset[bucketsCount+1]=bucketOffset[bucketsCount] + tmpSum ;
+				sliceOffsetInBucket[i]=0;
+				tmpSum=0;
+				bucketsCount++;
+			}
+		}
+
+	}
+	bucketsCount++;
+
+	return overflow;
+
+}
+
+template <typename T>
+void reduceMinMax(T* dData,int size,float& result,bool isMax)
+{
+
+	int step;
+	step=(size%2==0)?
+		(size/2):(size/2 +1);
+	int blockSize=BLOCK_SIZE;
+	int blockCount;
+	int length=size;
+	T originalResult;
+	while(step > 0)
+	{
+		if(step%BLOCK_SIZE==0)
+			blockCount=step/BLOCK_SIZE;
+		else 
+			blockCount=step/BLOCK_SIZE+1;
+
+		if(isMax)
+			reduceMaxD<<<blockCount,blockSize>>>(dData,step,length);
+		else 
+			reduceMinD<<<blockCount,blockSize>>>(dData,step,length);
+
+		length=step;
+
+		step=(step%2==0 || step==1)?(step/2):(step/2 +1);
+	}
+
+	CUDA_SAFE_CALL(cudaMemcpy(&originalResult, dData, sizeof(T), cudaMemcpyDeviceToHost));
+
+	result=(float)getValue(originalResult);
+}
+
+template <typename T>
+void evaluateDisorder(T* dData,int size,float maxValue, float minValue, int& listOrder)
+{
+	int blockCount;
+
+	if((size-1) % BLOCK_SIZE ==0)blockCount=size/BLOCK_SIZE;
+	else blockCount=size/BLOCK_SIZE+1;
+
+	float* dDiffData;
+	CUDA_SAFE_CALL(cudaMalloc((void**)&dDiffData, sizeof(float) * size));
+
+	calDifferenceD<<<blockCount,BLOCK_SIZE,(BLOCK_SIZE)*sizeof(T)>>>(dData,dDiffData,size);
+
+	float sum=0;
+
+	int step;
+	step=(size%2==0)?
+		(size/2):(size/2 +1);
+
+	int blockSize=BLOCK_SIZE;
+
+	int length=size;
+
+	while(step > 0)
+	{
+
+		if(step%BLOCK_SIZE==0)
+			blockCount=step/BLOCK_SIZE;
+		else 
+			blockCount=step/BLOCK_SIZE+1;
+
+		reduceSumD<<<blockCount,blockSize>>>(dDiffData,step,length);
+
+		length=step;
+
+		step=(step%2==0 || step==1)?(step/2):(step/2 +1);
+	}
+
+	CUDA_SAFE_CALL(cudaMemcpy(&sum, dDiffData, sizeof(float), cudaMemcpyDeviceToHost));
+
+	if( sum < (maxValue - minValue) * size / 10)
+		listOrder=NEARLY_SORTED;
+	else 
+		listOrder=DISORDERLY;
+
+	CUDA_SAFE_CALL(cudaFree(dDiffData));
+}
+
+template <typename T>
+void bbSortBody(T* dData,int size,int listOrder/*,float sliceStep,int sliceSize, T* dTmpData, float minValue,float maxValue*/)
+{
+	float minValue,maxValue;
+	T*  dTmpData;
+
+	CUDA_SAFE_CALL(cudaMalloc((void**)&dTmpData, sizeof(T) * size));
+	CUDA_SAFE_CALL(cudaMemcpy(dTmpData, dData, sizeof(T) * size, cudaMemcpyDeviceToDevice));
+	reduceMinMax(dTmpData,size,maxValue,true);
+	CUDA_SAFE_CALL(cudaMemcpy(dTmpData, dData, sizeof(T) * size, cudaMemcpyDeviceToDevice));
+	reduceMinMax(dTmpData,size,minValue,false);
+
+	if(minValue == maxValue)
+	{
+		CUDA_SAFE_CALL(cudaFree(dTmpData));
+		return ;
+	}
+
+	if(listOrder == AUTO_EVALUATE )
+	{
+		evaluateDisorder(dData,size,maxValue,minValue,listOrder);
+	}
+	
+	float sliceStep =(50.0*((double)(maxValue-minValue)/(double)size));
+	int sliceSize = (maxValue-minValue)/sliceStep + 10;
+
+	int blockCount;
+
+	if(size%BLOCK_SIZE==0)blockCount=size/BLOCK_SIZE;
+	else blockCount=size/BLOCK_SIZE+1;
+
+	unsigned int* dSliceCounts;
+	unsigned int* dOffsetInSlice;
+
+	CUDA_SAFE_CALL(cudaMalloc((void**)&dOffsetInSlice, sizeof(unsigned int) * size));
+	CUDA_SAFE_CALL(cudaMalloc((void**)&dSliceCounts, sizeof(unsigned int) * sliceSize));
+	CUDA_SAFE_CALL(cudaMemset(dSliceCounts,0, sizeof(int) * sliceSize));
+
+	if(listOrder == NEARLY_SORTED)
+	{
+		assignElementToSlicesNearlySortedD<<<blockCount, BLOCK_SIZE>>>(dData,size,dSliceCounts,dOffsetInSlice,minValue,sliceStep,sliceSize,blockCount);
+	}
+	else 
+		assignElementToSlicesD<<<blockCount, BLOCK_SIZE>>>(dData,size,dSliceCounts,dOffsetInSlice,minValue,sliceStep,sliceSize);
+	unsigned int* hSliceCounts=new unsigned int[sliceSize];
+	CUDA_SAFE_CALL(cudaMemcpy(hSliceCounts, dSliceCounts, sizeof(unsigned int) * sliceSize, cudaMemcpyDeviceToHost));
+
+	int looseBucketSize=size/100;
+
+	unsigned int* hBucketOffsets=new unsigned int[looseBucketSize];
+	unsigned int* hBucketSizes=new unsigned int[looseBucketSize];
+	unsigned int* hBucketOfSlices=new unsigned int[sliceSize];
+	unsigned int* hSliceOffsetInBucket=new unsigned int[sliceSize];
+	int bucketsCount=0;
+
+	memset(hBucketSizes,0,sizeof(int) * looseBucketSize);
+	memset(hSliceOffsetInBucket,0,sizeof(unsigned int) * sliceSize);
+
+	bool overflow;
+
+	overflow = assignSliceToBuckets(hSliceCounts,sliceSize,hBucketOffsets,hBucketOfSlices,hBucketSizes,hSliceOffsetInBucket,bucketsCount,sliceStep);
+
+	unsigned int* dBucketOffsets;
+	unsigned int* dBucketSizes;
+
+	unsigned int* dBucketOfSlices;
+	unsigned int* dSliceOffsetInBucket;
+
+	CUDA_SAFE_CALL(cudaMalloc((void**)&dBucketOfSlices, sizeof(unsigned int) * sliceSize));
+	CUDA_SAFE_CALL(cudaMalloc((void**)&dSliceOffsetInBucket, sizeof(unsigned int) * sliceSize));
+	CUDA_SAFE_CALL(cudaMalloc((void**)&dBucketOffsets, sizeof(unsigned int) * bucketsCount));
+	CUDA_SAFE_CALL(cudaMalloc((void**)&dBucketSizes, sizeof(unsigned int) * bucketsCount));
+
+
+	CUDA_SAFE_CALL(cudaMemcpy(dBucketOfSlices, hBucketOfSlices, sizeof(unsigned int) * sliceSize, cudaMemcpyHostToDevice));
+	CUDA_SAFE_CALL(cudaMemcpy(dSliceOffsetInBucket, hSliceOffsetInBucket, sizeof(unsigned int) * sliceSize, cudaMemcpyHostToDevice));
+	CUDA_SAFE_CALL(cudaMemcpy(dBucketOffsets, hBucketOffsets, sizeof(unsigned int) * bucketsCount, cudaMemcpyHostToDevice));
+	CUDA_SAFE_CALL(cudaMemcpy(dBucketSizes, hBucketSizes, sizeof(unsigned int) * bucketsCount, cudaMemcpyHostToDevice));
+
+	cudaBindTexture(0,tBucketOffsets,dBucketOffsets);
+	cudaBindTexture(0,tBucketSizes,dBucketSizes);
+	cudaBindTexture(0,tBucketOfSlices,dBucketOfSlices);
+	cudaBindTexture(0,tSliceOffsetInBucket,dSliceOffsetInBucket);
+
+	assignElementToBucketD<<<blockCount, BLOCK_SIZE>>>(dData,dTmpData,size,dOffsetInSlice,minValue,sliceStep);
+
+	CUDA_SAFE_CALL( cudaThreadSynchronize() );
+
+	bitonicSortD<<<bucketsCount, BLOCK_SIZE, sizeof(T) * BLOCK_SIZE>>>(dTmpData);
+
+    CUDA_SAFE_CALL(cudaMemcpy(dData, dTmpData, sizeof(T) * size, cudaMemcpyDeviceToDevice));
+	
+	if(overflow){
+		for(int i=0;i<bucketsCount;i++)
+		{
+			if(hBucketSizes[i] > BLOCK_SIZE)
+			{
+				bbSort(dData + hBucketOffsets[i],hBucketSizes[i],listOrder);
+			}
+		}
+	}
+	
+	delete hBucketOffsets;
+	delete hBucketOfSlices;
+	delete hSliceCounts;
+	delete hBucketSizes;
+
+	CUDA_SAFE_CALL(cudaFree(dOffsetInSlice));
+	CUDA_SAFE_CALL(cudaFree(dSliceCounts));
+	CUDA_SAFE_CALL(cudaFree(dTmpData));
+
+	cudaUnbindTexture( tBucketSizes );
+	CUDA_SAFE_CALL(cudaFree(dBucketSizes));
+
+	cudaUnbindTexture( tBucketOffsets );
+	CUDA_SAFE_CALL(cudaFree(dBucketOffsets));
+
+	cudaUnbindTexture( tBucketOfSlices );
+	CUDA_SAFE_CALL(cudaFree(dBucketOfSlices));
+
+	cudaUnbindTexture( tSliceOffsetInBucket );
+	CUDA_SAFE_CALL(cudaFree(dSliceOffsetInBucket));
+}
+
+/************************************************************************************
+
+Uncomment your desired function definition here
+
+Please note that, only one type of bbsort() can be used in a program, due to NVCC compiler doesn't support overriding kernel function
+
+float, double, int, uint, short, and ushort are originally supported, if you want to use bbsort() in double
+
+please follow the readme.txt
+
+Also note that you need to use 1.3 capbility (use arch=sm_13 in your compile command) to sort doubles
+
+*************************************************************************************/
+
+template<>
+void bbSort(float2* dData,int size,int listOrder)
+{
+
+	bbSortBody(dData,size,listOrder);
+}
+
+//void bbSort(float* dData,int size,int listOrder)
+//{
+//
+//	bbSortBody(dData,size,listOrder);
+//}
+
+//void bbSort(int* dData,int size,int listOrder)
+//{
+//
+//	bbSortBody(dData,size,listOrder);
+//}
+//
+//void bbSort(unsigned int* dData,int size,int listOrder)
+//{
+//
+//	bbSortBody(dData,size,listOrder);
+//}
+//
+//void bbSort(double* dData,int size,int listOrder)
+//{
+//
+//	bbSortBody(dData,size,listOrder);
+//}
--- a/platforms/cuda/src/kernels/bbsort.h
+++ b/platforms/cuda/src/kernels/bbsort.h
+/*
+ * Authored by: Chen, Shifu
+ * 
+ * Email: chen@gmtk.org
+ *
+ * Website: http://www.gmtk.org/gsort
+ *
+ * The code is distributed under BSD license, you are allowed to use, modify or sell this code, but a statement is required if you used this code any where.
+ * 
+ */
+#ifndef _BBSORT_H_
+#define _BBSORT_H_
+
+#define BLOCK_SIZE 512
+
+#define DISORDERLY 0
+#define NEARLY_SORTED 1
+#define AUTO_EVALUATE 2
+
+template <typename T>
+void bbSort(T* dData,int number,int listOrder=AUTO_EVALUATE);
+
+#endif // _BBSORT_H_
--- a/platforms/cuda/src/kernels/bbsort_kernel.cu
+++ b/platforms/cuda/src/kernels/bbsort_kernel.cu
+/*
+ * Authored by: Chen, Shifu
+ * 
+ * Email: chen@gmtk.org
+ *
+ * Website: http://www.gmtk.org/gsort
+ *
+ * The code is distributed under BSD license, you are allowed to use, modify or sell this code, but a statement is required if you used this code any where.
+ * 
+ */
+#ifndef _BBSORT_KERNEL_H_
+#define _BBSORT_KERNEL_H_
+
+#include "bbsort.h"
+#include "math_constants.h"
+
+texture<unsigned int, 1, cudaReadModeElementType> tBucketSizes;
+texture<unsigned int, 1, cudaReadModeElementType> tBucketOffsets;
+texture<unsigned int, 1, cudaReadModeElementType> tBucketOfSlices;
+texture<unsigned int, 1, cudaReadModeElementType> tSliceOffsetInBucket;
+
+__device__ float dGetValue(float2 v){
+	return v.y;
+}
+
+template <typename T>
+__device__ T dGetValue(T v){
+	return v;
+}
+
+
+__device__ void dPad(float2& v){
+	v.x=0x3fffffff;
+	v.y=0x4fffffff;
+}
+
+template <typename T>
+__device__ void dPad(T & v){
+	v=0x7fffffff;
+}
+
+template <typename T>
+__global__ static void reduceMaxD(T * dData,int step,int length)
+{
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+
+	if(index + step >=length)
+		return ;
+	dData[index] = dGetValue(dData[index])>dGetValue(dData[index+step])?dData[index]:dData[index+step];
+}
+
+template <typename T>
+__global__ static void reduceMinD(T * dData,int step,int length)
+{
+	
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+
+	if(index + step >=length)
+		return ;
+
+	dData[index] = dGetValue(dData[index])<dGetValue(dData[index+step])?dData[index]:dData[index+step];
+}
+
+__global__ static void reduceSumD(float * dDiffData,int step,int length)
+{
+
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+
+	if(index + step >=length)
+		return ;
+
+	dDiffData[index] += dDiffData[index+step];
+}
+
+template <typename T>
+__global__ static void calDifferenceD(T * dData,float * dDiffData,int size)
+{
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+
+	if(index > size-1)
+		return ;
+
+    const unsigned int tid = threadIdx.x;
+	
+	extern __shared__ T sData[];
+
+	sData[tid]=dData[index];
+
+	__syncthreads();
+	
+	if(tid < blockDim.x -1)
+		dDiffData[index] = abs(dGetValue(sData[tid+1]) - dGetValue(sData[tid]));
+	else 
+		dDiffData[index] =0;
+	
+}
+
+template <typename T>
+__device__ inline void dSwap(T & a, T & b)
+{
+    T tmp = a;
+    a = b;
+    b = tmp;
+}
+
+
+template <typename T>
+__global__ static void bitonicSortD(T * datas)
+{
+    extern __shared__ T shared[];
+
+	const unsigned int bid=blockIdx.x;
+
+    const unsigned int tid = threadIdx.x;
+
+	__shared__ unsigned int count;
+	__shared__ unsigned int offset;
+
+	if(tid == 0)
+	{
+		count=tex1Dfetch(tBucketSizes,bid);
+		offset=tex1Dfetch(tBucketOffsets,bid);
+	}
+
+	__syncthreads();
+
+    if(tid < count)
+		shared[tid] = datas[tid+offset];
+	else 
+	{
+		dPad(shared[tid]);
+	}
+
+    __syncthreads();
+
+    for (unsigned int k = 2; k <= BLOCK_SIZE; k *= 2)
+    {
+        for (unsigned int j = k / 2; j>0; j /= 2)
+        {
+            unsigned int ixj = tid ^ j;
+            
+
+            if (ixj > tid)
+            {
+                if ((tid & k) == 0)
+                {
+                    if (dGetValue(shared[tid]) > dGetValue(shared[ixj]))
+                    {
+                        dSwap(shared[tid], shared[ixj]);
+                    }
+                }
+                else
+                {
+                    if (dGetValue(shared[tid]) < dGetValue(shared[ixj]))
+                    {
+                        dSwap(shared[tid], shared[ixj]);
+                    }
+                }
+            }
+            
+            __syncthreads();
+        }
+    }
+    if(tid < count)
+		datas[tid+offset] = shared[tid];
+}
+
+template <typename T>
+
+__global__ void assignElementToSlicesD(T* dDatas,int number,unsigned int* dSliceCounts,unsigned int* dOffsetInSlice,float minValue,float step,int sliceSize)
+{
+	unsigned int index= __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
+
+	if(index > number-1)
+		return ;
+
+	unsigned int s=((dGetValue(dDatas[index]) - minValue)/ step);
+
+	unsigned int offset=atomicInc(dSliceCounts + s,0xFFFFFFF);
+
+	dOffsetInSlice[index] = offset;
+
+}
+
+template <typename T>
+__global__ void assignElementToSlicesNearlySortedD(T* dDatas,int number,unsigned int* dSliceCounts,unsigned int* dOffsetInSlice,float minValue,float step,int sliceSize,int blockCount)
+{
+	unsigned int index= blockIdx.x + blockCount * threadIdx.x;
+
+	if(index > number-1)
+		return ;
+
+	unsigned int s=((dGetValue(dDatas[index]) - minValue)/ step);
+
+	unsigned int offset=atomicInc(dSliceCounts + s,0xFFFFFFF);
+
+	dOffsetInSlice[index] = offset;
+
+}
+
+template <typename T>
+__global__ void assignElementToBucketD(T* dDatas,T*  dNewDatas,int number,unsigned int* dOffsetInSlice,float minValue,float step)
+{
+
+	unsigned int index= __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
+
+	if(index > number-1)
+		return ;
+
+	unsigned int s=((dGetValue(dDatas[index]) - minValue)/ step);
+
+	unsigned int b=tex1Dfetch(tBucketOfSlices,s);
+
+	unsigned int offset =tex1Dfetch(tBucketOffsets,b) + tex1Dfetch(tSliceOffsetInBucket,s) + dOffsetInSlice[index];
+
+	dNewDatas[offset] =dDatas[index];
+
+}
+
+#endif // _BBSORT_KERNEL_H_
--- a/platforms/cuda/src/kernels/cudatypes.h
+++ b/platforms/cuda/src/kernels/cudatypes.h
@@ -367,6 +367,8 @@ struct cudaGmxSimulation {
    int4*           pPmeParticleIndex;              // The grid indices for each atom
    float4*         pPmeParticleFraction;           // Fractional offset in the grid for each atom in all three dimensions.
    int*            pPmeInteractionFlags;           // Flags for which groups of grid points interact with which atoms
+    int*            pPmeAtomRange;                  // The range of sorted atoms at each grid point
+    float2*         pPmeAtomGridIndex;              // The grid point each atom is at
    unsigned int    bonds;                          // Number of bonds
    int4*           pBondID;                        // Bond atom and output buffer IDs
    float2*         pBondParameter;                 // Bond parameters

--- a/platforms/cuda/src/kernels/gpu.cpp
+++ b/platforms/cuda/src/kernels/gpu.cpp
@@ -786,6 +786,10 @@ void gpuSetPMEParameters(gpuContext gpu, float alpha)
    gpu->sim.pPmeParticleFraction = gpu->psPmeParticleFraction->_pDevData;
    gpu->psPmeInteractionFlags = new CUDAStream<int>(totalGroups*(gpu->sim.paddedNumberOfAtoms/32), 1, "PmeInteractionFlags");
    gpu->sim.pPmeInteractionFlags = gpu->psPmeInteractionFlags->_pDevData;
+    gpu->psPmeAtomRange = new CUDAStream<int>(gridSize.x*gridSize.y*gridSize.z+1, 1, "PmeAtomRange");
+    gpu->sim.pPmeAtomRange = gpu->psPmeAtomRange->_pDevData;
+    gpu->psPmeAtomGridIndex = new CUDAStream<float2>(gpu->natoms, 1, "PmeAtomGridIndex");
+    gpu->sim.pPmeAtomGridIndex = gpu->psPmeAtomGridIndex->_pDevData;

    // Initialize the b-spline moduli.

@@ -1686,6 +1690,8 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync)
    gpu->psPmeBsplineDtheta         = NULL;
    gpu->psPmeParticleIndex         = NULL;
    gpu->psPmeParticleFraction      = NULL;
+    gpu->psPmeAtomRange             = NULL;
+    gpu->psPmeAtomGridIndex         = NULL;
    gpu->psShakeID                  = NULL;
    gpu->psShakeParameter           = NULL;
    gpu->psSettleID                 = NULL;
@@ -1853,6 +1859,8 @@ void gpuShutDown(gpuContext gpu)
        delete gpu->psPmeBsplineDtheta;
        delete gpu->psPmeParticleIndex;
        delete gpu->psPmeParticleFraction;
+        delete gpu->psPmeAtomRange;
+        delete gpu->psPmeAtomGridIndex;
        cufftDestroy(gpu->fftplan);
    }
    delete gpu->psObcData;

--- a/platforms/cuda/src/kernels/gputypes.h
+++ b/platforms/cuda/src/kernels/gputypes.h
@@ -113,6 +113,8 @@ struct _gpuContext {
    CUDAStream<int4>* psPmeParticleIndex;      // The grid indices for each atom
    CUDAStream<float4>* psPmeParticleFraction; // Fractional offset in the grid for each atom in all three dimensions.
    CUDAStream<int>* psPmeInteractionFlags;    // Flags for which groups of grid points interact with which atoms
+    CUDAStream<int>* psPmeAtomRange;           // The range of sorted atoms at each grid point
+    CUDAStream<float2>* psPmeAtomGridIndex;    // The grid point each atom is at
    CUDAStream<float2>* psObcData;
    CUDAStream<float>* psObcChain;
    CUDAStream<float>* psBornForce;

--- a/platforms/cuda/src/kernels/kCalculatePME.cu
+++ b/platforms/cuda/src/kernels/kCalculatePME.cu
@@ -25,6 +25,7 @@
 * -------------------------------------------------------------------------- */

 #include "gputypes.h"
+#include "bbsort.h"
 #include <cuda.h>

 using namespace std;
@@ -106,48 +107,86 @@ __global__ void kUpdateGridIndexAndFraction_kernel()
                              fast_mod(__float2int_rd(tix.y), cSim.pmeGridSize.y),
                              fast_mod(__float2int_rd(tix.z), cSim.pmeGridSize.z), 0);
        cSim.pPmeParticleIndex[i] = itmp;
+        cSim.pPmeAtomGridIndex[i] = make_float2(i, itmp.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+itmp.y*cSim.pmeGridSize.z+itmp.z);
    }
+//
+//    // Compute flags for which atoms affect which groups of grid points.
+//
+//    const int3 numGroups = make_int3((cSim.pmeGridSize.x+cSim.pmeGroupSize.x-1)/cSim.pmeGroupSize.x, (cSim.pmeGridSize.y+cSim.pmeGroupSize.y-1)/cSim.pmeGroupSize.y, (cSim.pmeGridSize.z+cSim.pmeGroupSize.z-1)/cSim.pmeGroupSize.z);
+//    const unsigned int totalGroups = numGroups.x*numGroups.y*numGroups.z;
+//    const float3 gridScale = make_float3(cSim.pmeGridSize.x/cSim.periodicBoxSizeX, cSim.pmeGridSize.y/cSim.periodicBoxSizeY, cSim.pmeGridSize.z/cSim.periodicBoxSizeZ);
+//    for (int group = tid; group < totalGroups; group += tnb)
+//    {
+//        int3 gridBase;
+//        gridBase.x = group/(numGroups.y*numGroups.z);
+//        int remainder = group-gridBase.x*numGroups.y*numGroups.z;
+//        gridBase.y = remainder/numGroups.z;
+//        gridBase.z = remainder-gridBase.y*numGroups.z;
+//        gridBase.x *= cSim.pmeGroupSize.x;
+//        gridBase.y *= cSim.pmeGroupSize.y;
+//        gridBase.z *= cSim.pmeGroupSize.z;
+//        unsigned int flags = 0;
+//        unsigned int baseIndex = group*(cSim.paddedNumberOfAtoms/32);
+//        for (int atomBlock = 0; atomBlock < cSim.paddedNumberOfAtoms>>GRIDBITS; atomBlock++)
+//        {
+//            // Decide if this block actually needs to be processed.
+//
+//            int flagIndex = atomBlock%32;
+//            if (flagIndex == 0)
+//                flags = 0;
+//            float4 boxSize = cSim.pGridBoundingBox[atomBlock];
+//            float4 center = cSim.pGridCenter[atomBlock];
+//            int maxx = (int) ceil((center.x+boxSize.x)*gridScale.x)+cSim.pmeGroupSize.x+PME_ORDER;
+//            int maxy = (int) ceil((center.y+boxSize.y)*gridScale.y)+cSim.pmeGroupSize.y+PME_ORDER;
+//            int maxz = (int) ceil((center.z+boxSize.z)*gridScale.z)+cSim.pmeGroupSize.z+PME_ORDER;
+//            int minx = (int) floor((center.x-boxSize.x)*gridScale.x);
+//            int miny = (int) floor((center.y-boxSize.y)*gridScale.y);
+//            int minz = (int) floor((center.z-boxSize.z)*gridScale.z);
+//            int x = minx+(gridBase.x-minx)%cSim.pmeGridSize.x;
+//            int y = miny+(gridBase.y-miny)%cSim.pmeGridSize.y;
+//            int z = minz+(gridBase.z-minz)%cSim.pmeGridSize.z;
+//            if (maxx < x || maxy < y || maxz < z)
+//                flags += 1<<flagIndex;
+//            if (flagIndex == 31 || atomBlock == cSim.paddedNumberOfAtoms>>GRIDBITS)
+//                cSim.pPmeInteractionFlags[baseIndex+atomBlock/32] = flags;
+//        }
+//    }
+}

-    // Compute flags for which atoms affect which groups of grid points.
+/**
+ * For each grid point, find the range of sorted atoms associated with that point.
+ */

-    const int3 numGroups = make_int3((cSim.pmeGridSize.x+cSim.pmeGroupSize.x-1)/cSim.pmeGroupSize.x, (cSim.pmeGridSize.y+cSim.pmeGroupSize.y-1)/cSim.pmeGroupSize.y, (cSim.pmeGridSize.z+cSim.pmeGroupSize.z-1)/cSim.pmeGroupSize.z);
-    const unsigned int totalGroups = numGroups.x*numGroups.y*numGroups.z;
-    const float3 gridScale = make_float3(cSim.pmeGridSize.x/cSim.periodicBoxSizeX, cSim.pmeGridSize.y/cSim.periodicBoxSizeY, cSim.pmeGridSize.z/cSim.periodicBoxSizeZ);
-    for (int group = tid; group < totalGroups; group += tnb)
+__global__ void kFindAtomRangeForGrid_kernel()
+{
+    int thread = blockIdx.x*blockDim.x+threadIdx.x;
+    int start = (cSim.atoms*thread)/(blockDim.x*gridDim.x);
+    int end = (cSim.atoms*(thread+1))/(blockDim.x*gridDim.x);
+    int last = (thread == 0 ? -1 : cSim.pPmeAtomGridIndex[start-1].y);
+    for (int i = start; i < end; ++i)
    {
-        int3 gridBase;
-        gridBase.x = group/(numGroups.y*numGroups.z);
-        int remainder = group-gridBase.x*numGroups.y*numGroups.z;
-        gridBase.y = remainder/numGroups.z;
-        gridBase.z = remainder-gridBase.y*numGroups.z;
-        gridBase.x *= cSim.pmeGroupSize.x;
-        gridBase.y *= cSim.pmeGroupSize.y;
-        gridBase.z *= cSim.pmeGroupSize.z;
-        unsigned int flags = 0;
-        unsigned int baseIndex = group*(cSim.paddedNumberOfAtoms/32);
-        for (int atomBlock = 0; atomBlock < cSim.paddedNumberOfAtoms>>GRIDBITS; atomBlock++)
+        float2 atomData = cSim.pPmeAtomGridIndex[i];
+        int gridIndex = atomData.y;
+        if (gridIndex != last)
        {
-            // Decide if this block actually needs to be processed.
-
-            int flagIndex = atomBlock%32;
-            if (flagIndex == 0)
-                flags = 0;
-            float4 boxSize = cSim.pGridBoundingBox[atomBlock];
-            float4 center = cSim.pGridCenter[atomBlock];
-            int maxx = (int) ceil((center.x+boxSize.x)*gridScale.x)+cSim.pmeGroupSize.x+PME_ORDER;
-            int maxy = (int) ceil((center.y+boxSize.y)*gridScale.y)+cSim.pmeGroupSize.y+PME_ORDER;
-            int maxz = (int) ceil((center.z+boxSize.z)*gridScale.z)+cSim.pmeGroupSize.z+PME_ORDER;
-            int minx = (int) floor((center.x-boxSize.x)*gridScale.x);
-            int miny = (int) floor((center.y-boxSize.y)*gridScale.y);
-            int minz = (int) floor((center.z-boxSize.z)*gridScale.z);
-            int x = minx+(gridBase.x-minx)%cSim.pmeGridSize.x;
-            int y = miny+(gridBase.y-miny)%cSim.pmeGridSize.y;
-            int z = minz+(gridBase.z-minz)%cSim.pmeGridSize.z;
-            if (maxx < x || maxy < y || maxz < z)
-                flags += 1<<flagIndex;
-            if (flagIndex == 31 || atomBlock == cSim.paddedNumberOfAtoms>>GRIDBITS)
-                cSim.pPmeInteractionFlags[baseIndex+atomBlock/32] = flags;
+            for (int j = last+1; j <= gridIndex; ++j)
+                cSim.pPmeAtomRange[j] = i;
+            last = gridIndex;
+        }
+
+        // The grid index won't be needed again.  Reuse that component to hold the atom charge, thus saving
+        // an extra load operation in the charge spreading kernel.
+
+        cSim.pPmeAtomGridIndex[i].y = cSim.pPosq[(int) atomData.x].w;
    }
+
+    // Fill in values beyond the last atom.
+    
+    if (thread == blockDim.x*gridDim.x-1)
+    {
+        int gridSize = cSim.pmeGridSize.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z;
+        for (int j = last+1; j <= gridSize; ++j)
+            cSim.pPmeAtomRange[j] = cSim.atoms;
    }
 }

@@ -217,79 +256,116 @@ __global__ void kUpdateBsplines_kernel()
    }
 }

+//__global__ void kGridSpreadCharge_kernel()
+//{
+//    extern __shared__ float atomCharge[];
+//    int4* atomGridIndex = (int4*) &atomCharge[blockDim.x];
+//    const unsigned int totalWarps = gridDim.x*blockDim.x/GRID;
+//    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
+//    const int3 numGroups = make_int3((cSim.pmeGridSize.x+cSim.pmeGroupSize.x-1)/cSim.pmeGroupSize.x, (cSim.pmeGridSize.y+cSim.pmeGroupSize.y-1)/cSim.pmeGroupSize.y, (cSim.pmeGridSize.z+cSim.pmeGroupSize.z-1)/cSim.pmeGroupSize.z);
+//    const unsigned int totalGroups = numGroups.x*numGroups.y*numGroups.z;
+//    unsigned int group = warp*totalGroups/totalWarps;
+//    const unsigned int end = (warp+1)*totalGroups/totalWarps;
+//    const unsigned int index = threadIdx.x & (GRID - 1);
+//
+//    while (group < end)
+//    {
+//        // Process a group of grid points of size cSim.pmeGroupSize.  First figure out the base index for the group,
+//        // and the index of the specific point this thread will handle.
+//
+//        int3 gridBase;
+//        gridBase.x = group/(numGroups.y*numGroups.z);
+//        int remainder = group-gridBase.x*numGroups.y*numGroups.z;
+//        gridBase.y = remainder/numGroups.z;
+//        gridBase.z = remainder-gridBase.y*numGroups.z;
+//        gridBase.x *= cSim.pmeGroupSize.x;
+//        gridBase.y *= cSim.pmeGroupSize.y;
+//        gridBase.z *= cSim.pmeGroupSize.z;
+//        int3 gridPoint;
+//        gridPoint.x = index/(cSim.pmeGroupSize.y*cSim.pmeGroupSize.z);
+//        remainder = index-gridPoint.x*cSim.pmeGroupSize.y*cSim.pmeGroupSize.z;
+//        gridPoint.y = remainder/cSim.pmeGroupSize.z;
+//        gridPoint.z = remainder-gridPoint.y*cSim.pmeGroupSize.z;
+//        gridPoint.x += gridBase.x;
+//        gridPoint.y += gridBase.y;
+//        gridPoint.z += gridBase.z;
+//
+//        // Loop over blocks of atoms.
+//
+//        float result = 0.0f;
+//        int flags = 0;
+//        unsigned int baseIndex = group*(cSim.paddedNumberOfAtoms/32);
+//        for (int atomBlock = 0; atomBlock < cSim.paddedNumberOfAtoms>>GRIDBITS; atomBlock++)
+//        {
+//            // Decide if this block actually needs to be processed.
+//
+//            int flagIndex = atomBlock%32;
+//            if (flagIndex == 0)
+//                flags = cSim.pPmeInteractionFlags[baseIndex+atomBlock/32];
+//            if ((flags & (1<<flagIndex)) != 0)
+//                continue;
+//            int atomIndex = (atomBlock<<GRIDBITS)+index;
+//            if (atomIndex < cSim.atoms)
+//            {
+//                atomCharge[threadIdx.x] = cSim.pPosq[atomIndex].w;
+//                atomGridIndex[threadIdx.x] = cSim.pPmeParticleIndex[atomIndex];
+//            }
+//            int maxAtoms = min(GRID, cSim.atoms-(atomBlock<<GRIDBITS));
+//            for (int i = 0; i < maxAtoms; i++)
+//            {
+//                int localIndex = threadIdx.x-index+i;
+//                int atomIndex = (atomBlock<<GRIDBITS)+i;
+//                int ix = gridPoint.x-atomGridIndex[localIndex].x;
+//                int iy = gridPoint.y-atomGridIndex[localIndex].y;
+//                int iz = gridPoint.z-atomGridIndex[localIndex].z;
+//                ix += (ix < 0 ? cSim.pmeGridSize.x : 0);
+//                iy += (iy < 0 ? cSim.pmeGridSize.y : 0);
+//                iz += (iz < 0 ? cSim.pmeGridSize.z : 0);
+//                if (ix < PME_ORDER && iy < PME_ORDER && iz < PME_ORDER)
+//                    result += atomCharge[threadIdx.x-index+i]*cSim.pPmeBsplineTheta[atomIndex+ix*cSim.atoms].x*cSim.pPmeBsplineTheta[atomIndex+iy*cSim.atoms].y*cSim.pPmeBsplineTheta[atomIndex+iz*cSim.atoms].z;
+//            }
+//        }
+//        unsigned int gridIndex = gridPoint.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+gridPoint.y*cSim.pmeGridSize.z+gridPoint.z;
+//        if (gridIndex < cSim.pmeGridSize.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z)
+//            cSim.pPmeGrid[gridIndex] = make_cuComplex(result*sqrt(cSim.epsfac), 0.0f);
+//        group++;
+//    }
+//}
+
 __global__ void kGridSpreadCharge_kernel()
 {
-    extern __shared__ float atomCharge[];
-    int4* atomGridIndex = (int4*) &atomCharge[blockDim.x];
-    const unsigned int totalWarps = gridDim.x*blockDim.x/GRID;
-    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
-    const int3 numGroups = make_int3((cSim.pmeGridSize.x+cSim.pmeGroupSize.x-1)/cSim.pmeGroupSize.x, (cSim.pmeGridSize.y+cSim.pmeGroupSize.y-1)/cSim.pmeGroupSize.y, (cSim.pmeGridSize.z+cSim.pmeGroupSize.z-1)/cSim.pmeGroupSize.z);
-    const unsigned int totalGroups = numGroups.x*numGroups.y*numGroups.z;
-    unsigned int group = warp*totalGroups/totalWarps;
-    const unsigned int end = (warp+1)*totalGroups/totalWarps;
-    const unsigned int index = threadIdx.x & (GRID - 1);
-
-    while (group < end)
+    unsigned int numGridPoints = cSim.pmeGridSize.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z;
+    unsigned int numThreads = gridDim.x*blockDim.x;
+    for (int gridIndex = blockIdx.x*blockDim.x+threadIdx.x; gridIndex < numGridPoints; gridIndex += numThreads)
    {
-        // Process a group of grid points of size cSim.pmeGroupSize.  First figure out the base index for the group,
-        // and the index of the specific point this thread will handle.
-
-        int3 gridBase;
-        gridBase.x = group/(numGroups.y*numGroups.z);
-        int remainder = group-gridBase.x*numGroups.y*numGroups.z;
-        gridBase.y = remainder/numGroups.z;
-        gridBase.z = remainder-gridBase.y*numGroups.z;
-        gridBase.x *= cSim.pmeGroupSize.x;
-        gridBase.y *= cSim.pmeGroupSize.y;
-        gridBase.z *= cSim.pmeGroupSize.z;
        int3 gridPoint;
-        gridPoint.x = index/(cSim.pmeGroupSize.y*cSim.pmeGroupSize.z);
-        remainder = index-gridPoint.x*cSim.pmeGroupSize.y*cSim.pmeGroupSize.z;
-        gridPoint.y = remainder/cSim.pmeGroupSize.z;
-        gridPoint.z = remainder-gridPoint.y*cSim.pmeGroupSize.z;
-        gridPoint.x += gridBase.x;
-        gridPoint.y += gridBase.y;
-        gridPoint.z += gridBase.z;
-
-        // Loop over blocks of atoms.
-
+        gridPoint.x = gridIndex/(cSim.pmeGridSize.y*cSim.pmeGridSize.z);
+        int remainder = gridIndex-gridPoint.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z;
+        gridPoint.y = remainder/cSim.pmeGridSize.z;
+        gridPoint.z = remainder-gridPoint.y*cSim.pmeGridSize.z;
+        gridPoint.x += cSim.pmeGridSize.x;
+        gridPoint.y += cSim.pmeGridSize.y;
+        gridPoint.z += cSim.pmeGridSize.z;
        float result = 0.0f;
-        int flags = 0;
-        unsigned int baseIndex = group*(cSim.paddedNumberOfAtoms/32);
-        for (int atomBlock = 0; atomBlock < cSim.paddedNumberOfAtoms>>GRIDBITS; atomBlock++)
+        for (int ix = 0; ix < PME_ORDER; ++ix)
+            for (int iy = 0; iy < PME_ORDER; ++iy)
+                for (int iz = 0; iz < PME_ORDER; ++iz)
                {
-            // Decide if this block actually needs to be processed.
-
-            int flagIndex = atomBlock%32;
-            if (flagIndex == 0)
-                flags = cSim.pPmeInteractionFlags[baseIndex+atomBlock/32];
-            if ((flags & (1<<flagIndex)) != 0)
-                continue;
-            int atomIndex = (atomBlock<<GRIDBITS)+index;
-            if (atomIndex < cSim.atoms)
-            {
-                atomCharge[threadIdx.x] = cSim.pPosq[atomIndex].w;
-                atomGridIndex[threadIdx.x] = cSim.pPmeParticleIndex[atomIndex];
-            }
-            int maxAtoms = min(GRID, cSim.atoms-(atomBlock<<GRIDBITS));
-            for (int i = 0; i < maxAtoms; i++)
+                    int x = (gridPoint.x-ix)%cSim.pmeGridSize.x;
+                    int y = (gridPoint.y-iy)%cSim.pmeGridSize.y;
+                    int z = (gridPoint.z-iz)%cSim.pmeGridSize.z;
+                    int gridIndex = x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+y*cSim.pmeGridSize.z+z;
+                    int firstAtom = cSim.pPmeAtomRange[gridIndex];
+                    int lastAtom = cSim.pPmeAtomRange[gridIndex+1];
+                    for (int i = firstAtom; i < lastAtom; ++i)
                    {
-                int localIndex = threadIdx.x-index+i;
-                int atomIndex = (atomBlock<<GRIDBITS)+i;
-                int ix = gridPoint.x-atomGridIndex[localIndex].x;
-                int iy = gridPoint.y-atomGridIndex[localIndex].y;
-                int iz = gridPoint.z-atomGridIndex[localIndex].z;
-                ix += (ix < 0 ? cSim.pmeGridSize.x : 0);
-                iy += (iy < 0 ? cSim.pmeGridSize.y : 0);
-                iz += (iz < 0 ? cSim.pmeGridSize.z : 0);
-                if (ix < PME_ORDER && iy < PME_ORDER && iz < PME_ORDER)
-                    result += atomCharge[threadIdx.x-index+i]*cSim.pPmeBsplineTheta[atomIndex+ix*cSim.atoms].x*cSim.pPmeBsplineTheta[atomIndex+iy*cSim.atoms].y*cSim.pPmeBsplineTheta[atomIndex+iz*cSim.atoms].z;
+                        float2 atomData = cSim.pPmeAtomGridIndex[i];
+                        int atomIndex = atomData.x;
+                        float atomCharge = atomData.y;
+                        result += atomCharge*cSim.pPmeBsplineTheta[atomIndex+ix*cSim.atoms].x*cSim.pPmeBsplineTheta[atomIndex+iy*cSim.atoms].y*cSim.pPmeBsplineTheta[atomIndex+iz*cSim.atoms].z;
                    }
                }
-        unsigned int gridIndex = gridPoint.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+gridPoint.y*cSim.pmeGridSize.z+gridPoint.z;
-        if (gridIndex < cSim.pmeGridSize.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z)
        cSim.pPmeGrid[gridIndex] = make_cuComplex(result*sqrt(cSim.epsfac), 0.0f);
-        group++;
    }
 }

@@ -370,11 +446,17 @@ void kCalculatePME(gpuContext gpu)
 //    printf("kCalculatePME\n");
    kUpdateGridIndexAndFraction_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
    LAUNCHERROR("kUpdateGridIndexAndFraction");
+    bbSort(gpu->psPmeAtomGridIndex->_pDevData, gpu->natoms);
+    kFindAtomRangeForGrid_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
+    LAUNCHERROR("kFindAtomRangeForGrid");
    unsigned int threads = 16380/(2*PME_ORDER*sizeof(float4));
    kUpdateBsplines_kernel<<<gpu->sim.blocks, threads, 2*threads*PME_ORDER*sizeof(float4)>>>();
    LAUNCHERROR("kUpdateBsplines");
    kGridSpreadCharge_kernel<<<gpu->sim.blocks, 64, 64*(sizeof(float)+sizeof(int4))>>>();
    LAUNCHERROR("kGridSpreadCharge");
+//    gpu->psPmeGrid->Download();
+//    for (int i = 0; i < gpu->psPmeGrid->_length; i += 100)
+//        printf("%d %f %f\n", i, (*gpu->psPmeGrid)[i].x, (*gpu->psPmeGrid)[i].y);
    cufftExecC2C(gpu->fftplan, gpu->psPmeGrid->_pDevData, gpu->psPmeGrid->_pDevData, CUFFT_FORWARD);
    kReciprocalConvolution_kernel<<<gpu->sim.blocks, gpu->sim.nonbond_threads_per_block>>>();
    LAUNCHERROR("kReciprocalConvolution");