Unverified Commit 3b8df952 authored by Peter Eastman's avatar Peter Eastman Committed by GitHub
Browse files

Merge pull request #4632 from ex-rzr/make-hip-standard-platform

HIP platform
parents 5ce6a85d 28fb2918
/**
* This file contains HIP definitions for the macros and functions needed for the
* common compute framework.
*/
#define KERNEL extern "C" __global__
#define DEVICE __device__
#define LOCAL __shared__
#define LOCAL_ARG
#define GLOBAL
#define RESTRICT __restrict__
#define LOCAL_ID threadIdx.x
#define LOCAL_SIZE blockDim.x
#define GLOBAL_ID (blockIdx.x*blockDim.x+threadIdx.x)
#define GLOBAL_SIZE (blockDim.x*gridDim.x)
#define GROUP_ID blockIdx.x
#define NUM_GROUPS gridDim.x
#define SYNC_THREADS __syncthreads();
#define SYNC_WARPS {__builtin_amdgcn_wave_barrier(); __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "wavefront");}
#define MEM_FENCE __threadfence_block();
#define ATOMIC_ADD(dest, value) atomicAdd(dest, value)
typedef long long mm_long;
typedef unsigned long long mm_ulong;
#define SUPPORTS_DOUBLE_PRECISION 1
#define LAUNCH_BOUNDS_EXACT(WORK_GROUP_SIZE, WAVES_PER_EU) \
__attribute__((amdgpu_flat_work_group_size(WORK_GROUP_SIZE, WORK_GROUP_SIZE), amdgpu_waves_per_eu(WAVES_PER_EU, WAVES_PER_EU)))
#ifdef USE_DOUBLE_PRECISION
__device__ inline long long realToFixedPoint(double x) {
return static_cast<long long>(x * 0x100000000);
}
#else
__device__ inline long long realToFixedPoint(float x) {
// Faster way to calculate static_cast<long long>(x * 0x100000000) with exactly the same
// results but less instructions.
float integral = truncf(x);
float fractional = (x - integral) * 0x100000000;
unsigned int integral_u32 = static_cast<int>(integral);
unsigned int fractional_u32 = static_cast<unsigned int>(fabsf(fractional));
// A negative real number (with non-zero fractional) needs rounding-down x for integral and
// changing fractional's sign. However, -1 is used as a threshold instead of 0 because, when
// fractional is in (-1; 0], fractional_u32 is 0 and the number is considered an integer.
bool isNegReal = fractional <= -1.0f;
return (static_cast<unsigned long long>(isNegReal ? integral_u32 - 1 : integral_u32) << 32) |
static_cast<unsigned long long>(isNegReal ? 0 - fractional_u32 : fractional_u32);
}
#endif
#define BUFFER_SIZE 256
#if defined(AMD_RDNA)
typedef unsigned int warpflags;
__device__ inline int warpPopc(warpflags x) {
return __popc(x);
}
#else
typedef unsigned long long warpflags;
__device__ inline int warpPopc(warpflags x) {
return __popcll(x);
}
#endif
struct alignas(sizeof(__half) * 4) BoundingBox {
__device__ BoundingBox(real3 f) {
// Round up so we'll err on the side of making the box a little too large.
// This ensures interactions will never be missed.
v[0] = __float2half_ru((float) f.x);
v[1] = __float2half_ru((float) f.y);
v[2] = __float2half_ru((float) f.z);
}
__device__ real3 toReal3() const {
return make_real3(__half2float(v[0]), __half2float(v[1]), __half2float(v[2]));
}
private:
__half v[3];
};
/**
* Find a bounding box for the atoms in each block.
*/
extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
const real4* __restrict__ posq, real4* __restrict__ blockCenter, real4* __restrict__ blockBoundingBox, int* __restrict__ rebuildNeighborList,
real2* __restrict__ blockSizeRange) {
const int indexInTile = threadIdx.x%TILE_SIZE;
const int index = warpSize == TILE_SIZE ? blockIdx.x : (blockIdx.x*(warpSize/TILE_SIZE) + threadIdx.x/TILE_SIZE);
const int base = index * TILE_SIZE;
if (base >= numAtoms)
return;
real4 tPos = posq[base + indexInTile < numAtoms ? base + indexInTile : 0];
#ifdef USE_PERIODIC
real4 pos;
pos.x = SHFL(tPos.x, 0);
pos.y = SHFL(tPos.y, 0);
pos.z = SHFL(tPos.z, 0);
APPLY_PERIODIC_TO_POS(pos)
real4 minPos = pos;
real4 maxPos = pos;
for (int i = 1; i < TILE_SIZE; i++) {
pos.x = SHFL(tPos.x, i);
pos.y = SHFL(tPos.y, i);
pos.z = SHFL(tPos.z, i);
real4 center = 0.5f*(maxPos+minPos);
APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)
minPos = make_real4(min(minPos.x,pos.x), min(minPos.y,pos.y), min(minPos.z,pos.z), 0);
maxPos = make_real4(max(maxPos.x,pos.x), max(maxPos.y,pos.y), max(maxPos.z,pos.z), 0);
}
#else
real4 minPos = tPos;
real4 maxPos = tPos;
for (int i = TILE_SIZE >> 1; i > 0; i >>= 1) {
real4 tpos1, tpos2;
tpos1.x = __shfl_down(minPos.x, i, TILE_SIZE);
tpos1.y = __shfl_down(minPos.y, i, TILE_SIZE);
tpos1.z = __shfl_down(minPos.z, i, TILE_SIZE);
tpos2.x = __shfl_down(maxPos.x, i, TILE_SIZE);
tpos2.y = __shfl_down(maxPos.y, i, TILE_SIZE);
tpos2.z = __shfl_down(maxPos.z, i, TILE_SIZE);
minPos.x = min(minPos.x, tpos1.x);
minPos.y = min(minPos.y, tpos1.y);
minPos.z = min(minPos.z, tpos1.z);
maxPos.x = max(maxPos.x, tpos2.x);
maxPos.y = max(maxPos.y, tpos2.y);
maxPos.z = max(maxPos.z, tpos2.z);
}
minPos.x = SHFL(minPos.x, 0);
minPos.y = SHFL(minPos.y, 0);
minPos.z = SHFL(minPos.z, 0);
maxPos.x = SHFL(maxPos.x, 0);
maxPos.y = SHFL(maxPos.y, 0);
maxPos.z = SHFL(maxPos.z, 0);
#endif
real4 blockSize = 0.5f*(maxPos-minPos);
real4 center = 0.5f*(maxPos+minPos);
center.w = 0;
real4 delta = tPos - center;
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real tdelta = delta.x*delta.x+delta.y*delta.y+delta.z*delta.z;
real tcenter = max(center.w, tdelta);
for (int i = TILE_SIZE >> 1; i > 0; i >>= 1) {
real t = __shfl_down(tcenter, i, TILE_SIZE);
tcenter = max(tcenter, t);
}
if (indexInTile == 0) {
center.w = SQRT(tcenter);
blockBoundingBox[index] = blockSize;
blockCenter[index] = center;
// Record the range of sizes.
real totalSize = blockSize.x+blockSize.y+blockSize.z;
atomicMin(&blockSizeRange->x, totalSize);
atomicMax(&blockSizeRange->y, totalSize);
}
if (blockIdx.x == 0 && threadIdx.x == 0)
rebuildNeighborList[0] = 0;
}
extern "C" __global__ void computeSortKeys(const real4* __restrict__ blockBoundingBox, unsigned int* __restrict__ sortedBlocks, const real2* __restrict__ blockSizeRange/*, int numSizes*/) {
// Sort keys store the bin in the high order part and the block in the low
// order part.
real2 sizeRange = make_real2(LOG(blockSizeRange->x), LOG(blockSizeRange->y));
int numSizeBins = 20;
real scale = numSizeBins/(sizeRange.y-sizeRange.x);
unsigned int i = threadIdx.x+blockIdx.x*blockDim.x;
if (i < NUM_BLOCKS) {
real4 box = blockBoundingBox[i];
real size = LOG(box.x+box.y+box.z);
int bin = (size-sizeRange.x)*scale;
bin = max(0, min(bin, numSizeBins-1));
sortedBlocks[i] = (((unsigned int) bin)<<BIN_SHIFT) + i;
}
}
/**
* Sort the data about bounding boxes so it can be accessed more efficiently in the next kernel.
*/
extern "C" __global__ void sortBoxData(const unsigned int* __restrict__ sortedBlocks, const real4* __restrict__ blockCenter,
const real4* __restrict__ blockBoundingBox, real4* __restrict__ sortedBlockCenter, BoundingBox* __restrict__ sortedBlockBoundingBox,
#ifdef USE_LARGE_BLOCKS
real4* __restrict__ largeBlockCenter, BoundingBox* __restrict__ largeBlockBoundingBox, real4 periodicBoxSize,
real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
#endif
const real4* __restrict__ posq, const real4* __restrict__ oldPositions,
unsigned int* __restrict__ interactionCount, int* __restrict__ rebuildNeighborList, bool forceRebuild,
real2* __restrict__ blockSizeRange) {
int i = threadIdx.x+blockIdx.x*blockDim.x;
if (i < NUM_BLOCKS) {
unsigned int index = sortedBlocks[i] & BLOCK_INDEX_MASK;
sortedBlockCenter[i] = blockCenter[index];
sortedBlockBoundingBox[i] = BoundingBox(trimTo3(blockBoundingBox[index]));
#ifdef USE_LARGE_BLOCKS
// Compute the sizes of large blocks (composed of warpSize regular blocks) starting from each block.
real4 minPos = blockCenter[index]-blockBoundingBox[index];
real4 maxPos = blockCenter[index]+blockBoundingBox[index];
int last = min(i+warpSize, NUM_BLOCKS);
for (int j = i+1; j < last; j++) {
unsigned int index2 = sortedBlocks[j] & BLOCK_INDEX_MASK;
real4 blockPos = blockCenter[index2];
real4 width = blockBoundingBox[index2];
#ifdef USE_PERIODIC
real4 center = 0.5f*(maxPos+minPos);
APPLY_PERIODIC_TO_POS_WITH_CENTER(blockPos, center)
#endif
minPos = make_real4(min(minPos.x, blockPos.x-width.x), min(minPos.y, blockPos.y-width.y), min(minPos.z, blockPos.z-width.z), 0);
maxPos = make_real4(max(maxPos.x, blockPos.x+width.x), max(maxPos.y, blockPos.y+width.y), max(maxPos.z, blockPos.z+width.z), 0);
}
largeBlockCenter[i] = 0.5f*(maxPos+minPos);
largeBlockBoundingBox[i] = BoundingBox(trimTo3(0.5f*(maxPos-minPos)));
#endif
}
// Initialize min and max for the atomic reduction in findBlockBounds (for the next step)
if (i == 0) {
blockSizeRange->x = 1e38;
blockSizeRange->y = 0;
}
// Also check whether any atom has moved enough so that we really need to rebuild the neighbor list.
bool rebuild = forceRebuild;
if (i < NUM_ATOMS) {
real4 delta = oldPositions[i]-posq[i];
if (delta.x*delta.x + delta.y*delta.y + delta.z*delta.z > 0.25f*PADDING*PADDING)
rebuild = true;
}
if (rebuild) {
rebuildNeighborList[0] = 1;
interactionCount[0] = 0;
interactionCount[1] = 0;
}
}
#if MAX_BITS_FOR_PAIRS > 0
__device__ inline
void collectInteractions(unsigned int& interacts, float d, int bit) {
interacts |= (__float_as_uint(d) >> 31) << bit;
}
__device__ inline
void collectInteractions(unsigned int& interacts, double d, int bit) {
interacts |= ((unsigned int)__double2hiint(d) >> 31) << bit;
}
#else
// Simplified version that does not collect individual bits (they are not needed without single pairs),
// only sets the flag that there are any interactions.
__device__ inline
void collectInteractions(unsigned int& interacts, float d, int) {
interacts |= __float_as_uint(d) & (1 << 31);
}
__device__ inline
void collectInteractions(unsigned int& interacts, double d, int) {
interacts |= (unsigned int)__double2hiint(d) & (1 << 31);
}
#endif
#if !defined(USE_DOUBLE_PRECISION) && __has_builtin(__builtin_amdgcn_mfma_f32_4x4x1f32)
#define USE_MFMA
using vfloat = __attribute__((__vector_size__(4 * sizeof(float)))) float;
template<int BlockId>
inline __device__
void mfma4x4(const float4& pos1, const float4& pos2, const vfloat& c, unsigned int& interacts) {
vfloat d;
d = __builtin_amdgcn_mfma_f32_4x4x1f32(pos1.x, -pos2.x, c, 4, BlockId, 0);
d = __builtin_amdgcn_mfma_f32_4x4x1f32(pos1.y, -pos2.y, d, 4, BlockId, 0);
d = __builtin_amdgcn_mfma_f32_4x4x1f32(pos1.z, -pos2.z, d, 4, BlockId, 0);
d = __builtin_amdgcn_mfma_f32_4x4x1f32(pos1.w, 1.0f, d, 4, BlockId, 0);
#pragma unroll
for (int i = 0; i < 4; i++) {
collectInteractions(interacts, d[i], BlockId * 4 + i);
}
}
#endif
/**
* Compare the bounding boxes for each pair of atom blocks (comprised of TILE_SIZE atoms each), forming a tile. If the two
* atom blocks are sufficiently far apart, mark them as non-interacting. There are two stages in the algorithm.
*
* STAGE 1:
*
* A coarse grained atom block against interacting atom block neighbour list is constructed.
*
* Each warp first loads in some block X of interest. Each thread within the warp then loads
* in a different atom block Y. If Y has exclusions with X, then Y is not processed. If the bounding boxes
* of the two atom blocks are within the cutoff distance, then the two atom blocks are considered to be
* interacting and Y is added to the buffer for X.
*
* STAGE 2:
*
* A fine grained atom block against interacting atoms neighbour list is constructed.
*
* The warp loops over atom blocks Y that were found to (possibly) interact with atom block X. Each thread
* in the warp loops over the TILE_SIZE atoms in X and compares their positions to one particular atom from block Y.
* If it finds one closer than the cutoff distance, the atom is added to the list of atoms interacting with block X.
* This continues until the buffer fills up, at which point the results are written to global memory.
*
* [in] periodicBoxSize - size of the rectangular periodic box
* [in] invPeriodicBoxSize - inverse of the periodic box
* [in] blockCenter - the center of each bounding box
* [in] blockBoundingBox - bounding box of each atom block
* [out] interactionCount - total number of tiles that have interactions
* [out] interactingTiles - set of blocks that have interactions
* [out] interactingAtoms - a list of atoms that interact with each atom block
* [in] posq - x,y,z coordinates of each atom and charge q
* [in] maxTiles - maximum number of tiles to process, used for multi-GPUs
* [in] startBlockIndex - first block to process, used for multi-GPUs,
* [in] numBlocks - total number of atom blocks
* [in] sortedBlocks - a sorted list of atom blocks based on volume
* [in] sortedBlockCenter - sorted centers, duplicated for fast access to avoid indexing
* [in] sortedBlockBoundingBox - sorted bounding boxes, duplicated for fast access
* [in] exclusionIndices - maps into exclusionRowIndices with the starting position for a given atom
* [in] exclusionRowIndices - stores the a continuous list of exclusions
* eg: block 0 is excluded from atom 3,5,6
* block 1 is excluded from atom 3,4
* block 2 is excluded from atom 1,3,5,6
* exclusionIndices[0][3][5][8]
* exclusionRowIndices[3][5][6][3][4][1][3][5][6]
* index 0 1 2 3 4 5 6 7 8
* [out] oldPos - stores the positions of the atoms in which this neighbourlist was built on
* - this is used to decide when to rebuild a neighbourlist
* [in] rebuildNeighbourList - whether or not to execute this kernel
*
*/
extern "C" __global__ __launch_bounds__(GROUP_SIZE) void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
unsigned int* __restrict__ interactionCount, int* __restrict__ interactingTiles, unsigned int* __restrict__ interactingAtoms,
int2* __restrict__ singlePairs, const real4* __restrict__ posq, unsigned int maxTiles, unsigned int maxSinglePairs,
unsigned int startBlockIndex, unsigned int numBlocks, const unsigned int* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter,
const BoundingBox* __restrict__ sortedBlockBoundingBox,
#ifdef USE_LARGE_BLOCKS
const real4* __restrict__ largeBlockCenter, const BoundingBox* __restrict__ largeBlockBoundingBox,
#endif
const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
real4* __restrict__ oldPositions, const int* __restrict__ rebuildNeighborList) {
if (rebuildNeighborList[0] == 0)
return; // The neighbor list doesn't need to be rebuilt.
constexpr int tilesPerWarp = warpSize/TILE_SIZE;
constexpr int warpsPerBlock = GROUP_SIZE/warpSize;
const int indexInWarp = threadIdx.x%warpSize;
const int indexInTile = threadIdx.x%TILE_SIZE;
const int tileInWarp = tilesPerWarp == 1 ? 0 : (threadIdx.x/TILE_SIZE)%tilesPerWarp;
const int warpInBlock = warpsPerBlock == 1 ? 0 : threadIdx.x/warpSize;
const int warpIndex = blockIdx.x*warpsPerBlock + (warpsPerBlock == 1 ? 0 : warpInBlock);
const warpflags warpMask = (static_cast<warpflags>(1)<<indexInWarp)-1;
__shared__ int workgroupBuffer[BUFFER_SIZE*warpsPerBlock];
__shared__ real4 workgroupPosBuffer[TILE_SIZE*warpsPerBlock];
__shared__ int workgroupExclusions[MAX_EXCLUSIONS*warpsPerBlock];
__shared__ int workgroupBlock2Buffer[(warpSize+1)*warpsPerBlock];
int* buffer = workgroupBuffer+BUFFER_SIZE*warpInBlock;
real4* posBuffer = workgroupPosBuffer+TILE_SIZE*warpInBlock;
int* exclusionsForX = workgroupExclusions+MAX_EXCLUSIONS*warpInBlock;
int* block2Buffer = workgroupBlock2Buffer+(warpSize+1)*warpInBlock;
// Loop over blocks.
int block1 = startBlockIndex+warpIndex/NUM_TILES_IN_BATCH;
if (block1 < startBlockIndex+numBlocks) {
// Load data for this block. Note that all threads in a warp are processing the same block.
int x = sortedBlocks[block1] & BLOCK_INDEX_MASK;
real4 blockCenterX = sortedBlockCenter[block1];
real3 blockSizeX = sortedBlockBoundingBox[block1].toReal3();
int neighborsInBuffer = 0;
real4 pos1 = posq[x*TILE_SIZE+indexInTile];
#ifdef USE_PERIODIC
const bool singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= PADDED_CUTOFF &&
0.5f*periodicBoxSize.y-blockSizeX.y >= PADDED_CUTOFF &&
0.5f*periodicBoxSize.z-blockSizeX.z >= PADDED_CUTOFF);
if (singlePeriodicCopy) {
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
APPLY_PERIODIC_TO_POS_WITH_CENTER(pos1, blockCenterX)
}
#endif
pos1.w = 0.5f * (pos1.x * pos1.x + pos1.y * pos1.y + pos1.z * pos1.z);
if (tileInWarp == 0) {
posBuffer[indexInTile] = pos1;
}
// Load exclusion data for block x.
const int exclusionStart = exclusionRowIndices[x];
const int exclusionEnd = exclusionRowIndices[x+1];
const int numExclusions = exclusionEnd-exclusionStart;
#pragma unroll 1
for (int j = indexInWarp; j < numExclusions; j += warpSize)
exclusionsForX[j] = exclusionIndices[exclusionStart+j];
// Loop over atom blocks to search for neighbors. The threads in a warp compare block1 against warpSize
// other blocks in parallel.
// For small systems multiple warps (NUM_TILES_IN_BATCH = 4, 2...) process one block1 reducing the overall
// duration of the kernel because first blocks block1 have to process more block2 blocks so most of compute
// units are idle at the end of the kernel (the kernel works on the upper triangle of
// the NUM_BLOCKS x NUM_BLOCKS matrix).
#ifdef USE_LARGE_BLOCKS
warpflags largeBlockFlags = 0;
int loadedLargeBlocks = 0;
#endif
int block2Count = 0;
// Load blocks from addresses aligned by warpSize for faster loading from sortedBlockCenter and sortedBlockBoundingBox.
for (int block2Base = ((block1+1)/warpSize + warpIndex%NUM_TILES_IN_BATCH)*warpSize; block2Base < NUM_BLOCKS; block2Base += warpSize*NUM_TILES_IN_BATCH) {
// Last iteration cannot be skipped (on CDNA where tilesPerWarp == 2)
const bool lastIteration = block2Base + warpSize*NUM_TILES_IN_BATCH >= NUM_BLOCKS;
#ifdef USE_LARGE_BLOCKS
if (loadedLargeBlocks == 0) {
// Check the next set of large blocks.
int largeBlockIndex = block2Base + warpSize*NUM_TILES_IN_BATCH*indexInWarp;
bool includeLargeBlock = false;
if (largeBlockIndex < NUM_BLOCKS) {
real4 largeCenter = largeBlockCenter[largeBlockIndex];
real3 largeSize = largeBlockBoundingBox[largeBlockIndex].toReal3();
real4 blockDelta = blockCenterX-largeCenter;
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(blockDelta)
#endif
blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSizeX.x-largeSize.x);
blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSizeX.y-largeSize.y);
blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSizeX.z-largeSize.z);
includeLargeBlock = (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < PADDED_CUTOFF_SQUARED);
#ifdef TRICLINIC
// The calculation to find the nearest periodic copy is only guaranteed to work if the nearest copy is less than half a box width away.
// If there's any possibility we might have missed it, do a detailed check.
if (periodicBoxSize.z/2-blockSizeX.z-largeSize.z < PADDED_CUTOFF || periodicBoxSize.y/2-blockSizeX.y-largeSize.y < PADDED_CUTOFF)
includeLargeBlock = true;
#endif
}
largeBlockFlags = __ballot(includeLargeBlock);
loadedLargeBlocks = warpSize;
}
loadedLargeBlocks--;
if ((largeBlockFlags&1) == 0 && !lastIteration) {
// None of the next warpSize blocks interact with block 1.
largeBlockFlags >>= 1;
continue;
}
largeBlockFlags >>= 1;
#endif
int block2 = block2Base+indexInWarp;
bool includeBlock2 = (block1 < block2 && block2 < NUM_BLOCKS);
block2 = includeBlock2 ? block2 : block1;
bool forceInclude = false;
real4 blockCenterY = sortedBlockCenter[block2];
real4 blockDelta = blockCenterX-blockCenterY;
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(blockDelta)
#endif
includeBlock2 &= (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < (PADDED_CUTOFF+blockCenterX.w+blockCenterY.w)*(PADDED_CUTOFF+blockCenterX.w+blockCenterY.w));
#ifndef TRICLINIC
if (!lastIteration && __ballot(includeBlock2) == 0)
continue;
#endif
real3 blockSizeY = sortedBlockBoundingBox[block2].toReal3();
blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSizeX.x-blockSizeY.x);
blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSizeX.y-blockSizeY.y);
blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSizeX.z-blockSizeY.z);
includeBlock2 &= (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < PADDED_CUTOFF_SQUARED);
#ifdef TRICLINIC
// The calculation to find the nearest periodic copy is only guaranteed to work if the nearest copy is less than half a box width away.
// If there's any possibility we might have missed it, do a detailed check.
if (periodicBoxSize.z/2-blockSizeX.z-blockSizeY.z < PADDED_CUTOFF || periodicBoxSize.y/2-blockSizeX.y-blockSizeY.y < PADDED_CUTOFF)
includeBlock2 = forceInclude = true;
#endif
// Collect any blocks we identified as potentially containing neighbors.
warpflags includeBlockFlags = __ballot(includeBlock2);
if (includeBlock2) {
int index = block2Count + warpPopc(includeBlockFlags&warpMask);
block2Buffer[index] = (block2 << 1) | (forceInclude ? 1 : 0);
}
block2Count += warpPopc(includeBlockFlags);
// Loop over the collected candidates (each warp processes 2 blocks on CDNA or 1 block on RDNA).
// Process even number of blocks on CDNA so both half-warps have work to do (except for
// the last iteration of the for-block2Base loop when the left-over must be processed).
const int block2ToProcess = lastIteration ? block2Count : block2Count/tilesPerWarp*tilesPerWarp;
for (int block2Index = 0; block2Index < block2ToProcess; block2Index += tilesPerWarp) {
bool includeBlock2 = block2Index + tileInWarp < block2Count;
const int b = block2Buffer[min(block2Index + tileInWarp, block2Count - 1)];
const bool forceInclude = b & 1;
const int block2 = b >> 1;
int y = sortedBlocks[block2] & BLOCK_INDEX_MASK;
#pragma unroll 1
for (int k = indexInTile; k < numExclusions; k += TILE_SIZE)
includeBlock2 &= (exclusionsForX[k] != y);
includeBlock2 = BALLOT(!includeBlock2) == 0;
// Check each atom in block Y for interactions.
int atom2 = y*TILE_SIZE+indexInTile;
real4 pos2 = posq[atom2];
#ifdef USE_PERIODIC
if (singlePeriodicCopy) {
APPLY_PERIODIC_TO_POS_WITH_CENTER(pos2, blockCenterX)
}
#endif
pos2.w = 0.5f * (pos2.x * pos2.x + pos2.y * pos2.y + pos2.z * pos2.z);
real4 blockCenterY = sortedBlockCenter[block2];
real3 atomDelta = trimTo3(pos1)-trimTo3(blockCenterY);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(atomDelta)
#endif
tileflags atomFlags = BALLOT(forceInclude || atomDelta.x*atomDelta.x+atomDelta.y*atomDelta.y+atomDelta.z*atomDelta.z < (PADDED_CUTOFF+blockCenterY.w)*(PADDED_CUTOFF+blockCenterY.w));
tileflags interacts = 0;
// The condition `posj.w + pos2.w - posj.x*pos2.x - posj.y*pos2.y - posj.z*pos2.z < 0.5f * PADDED_CUTOFF_SQUARED` is expressed as
// `posj.x*pos2.x - posj.y*pos2.y - posj.z*pos2.z - posj.w - 0.5f * PADDED_CUTOFF_SQUARED - pos2.w` and computed using fma
// (it saves 1 instruction).
// Sign bit is used directly instead of `halfDist2 < 0.5f * PADDED_CUTOFF_SQUARED ? 1<<j : 0`.
#ifdef USE_PERIODIC
if (!singlePeriodicCopy) {
while (atomFlags) {
int j = __ffs(atomFlags)-1;
atomFlags = atomFlags ^ (static_cast<tileflags>(1) << j);
real3 delta = trimTo3(pos2)-trimTo3(posBuffer[j]);
APPLY_PERIODIC_TO_DELTA(delta)
real d = delta.x*delta.x+delta.y*delta.y+delta.z*delta.z - PADDED_CUTOFF_SQUARED;
collectInteractions(interacts, d, j);
}
}
else {
#endif
const real lim = 0.5f * PADDED_CUTOFF_SQUARED - pos2.w;
#if defined(USE_MFMA)
const vfloat c = { -lim, -lim, -lim, -lim };
mfma4x4<0>(pos1, pos2, c, interacts);
mfma4x4<1>(pos1, pos2, c, interacts);
mfma4x4<2>(pos1, pos2, c, interacts);
mfma4x4<3>(pos1, pos2, c, interacts);
mfma4x4<4>(pos1, pos2, c, interacts);
mfma4x4<5>(pos1, pos2, c, interacts);
mfma4x4<6>(pos1, pos2, c, interacts);
mfma4x4<7>(pos1, pos2, c, interacts);
#else
while (atomFlags) {
int j = __ffs(atomFlags)-1;
atomFlags = atomFlags ^ (static_cast<tileflags>(1) << j);
real4 posj = posBuffer[j];
real d = fma(-posj.x, pos2.x, fma(-posj.y, pos2.y, fma(-posj.z, pos2.z, posj.w - lim)));
collectInteractions(interacts, d, j);
}
#endif
#ifdef USE_PERIODIC
}
#endif
if (atom2 >= NUM_ATOMS || !includeBlock2) {
interacts = 0;
}
#if MAX_BITS_FOR_PAIRS > 0
const unsigned int interactCount = __popc(interacts);
// Record interactions that should be computed as single pairs rather than in blocks.
const bool storeAsSinglePair = interactCount > 0 && interactCount <= MAX_BITS_FOR_PAIRS;
if (__ballot(storeAsSinglePair)) {
unsigned int sum = 0;
unsigned int prevSum = 0;
for (int i = 1; i <= MAX_BITS_FOR_PAIRS; i++) {
warpflags b = __ballot(interactCount == i);
sum += warpPopc(b) * i;
prevSum += warpPopc(b&warpMask) * i;
}
unsigned int pairStartIndex = 0;
if (indexInWarp == 0)
pairStartIndex = atomicAdd(&interactionCount[1], sum);
pairStartIndex = __shfl(pairStartIndex, 0);
unsigned int pairIndex = pairStartIndex + prevSum;
if (storeAsSinglePair && pairIndex+interactCount <= maxSinglePairs) {
while (interacts != 0) {
int j = __ffs(interacts)-1;
singlePairs[pairIndex] = make_int2(atom2, x*TILE_SIZE+j);
interacts = interacts ^ (static_cast<tileflags>(1) << j);
pairIndex++;
}
}
}
#else
const unsigned int interactCount = interacts;
#endif
// Add any interacting atoms to the buffer.
warpflags includeAtomFlags = __ballot(interactCount > MAX_BITS_FOR_PAIRS);
if (interactCount > MAX_BITS_FOR_PAIRS) {
int index = neighborsInBuffer+warpPopc(includeAtomFlags&warpMask);
buffer[index] = atom2;
}
neighborsInBuffer += warpPopc(includeAtomFlags);
if (neighborsInBuffer > BUFFER_SIZE-warpSize) {
// Store the new tiles to memory.
unsigned int tilesToStore = neighborsInBuffer/warpSize*tilesPerWarp;
unsigned int tileStartIndex = 0;
if (indexInWarp == 0)
tileStartIndex = atomicAdd(&interactionCount[0], tilesToStore);
unsigned int newTileStartIndex = __shfl(tileStartIndex, 0);
if (newTileStartIndex+tilesToStore <= maxTiles) {
if (indexInWarp < tilesToStore)
interactingTiles[newTileStartIndex+indexInWarp] = x;
for (int j = 0; j < tilesToStore/tilesPerWarp; j++)
interactingAtoms[newTileStartIndex*TILE_SIZE+j*warpSize+indexInWarp] = buffer[j*warpSize+indexInWarp];
}
if (indexInWarp+TILE_SIZE*tilesToStore < BUFFER_SIZE)
buffer[indexInWarp] = buffer[indexInWarp+TILE_SIZE*tilesToStore];
neighborsInBuffer -= TILE_SIZE*tilesToStore;
}
}
// Move not processed blocks to the head of block2Buffer.
if (indexInWarp < block2Count - block2ToProcess)
block2Buffer[indexInWarp] = block2Buffer[block2ToProcess + indexInWarp];
block2Count = block2Count - block2ToProcess;
}
// If we have a partially filled buffer, store it to memory.
if (neighborsInBuffer > 0) {
unsigned int tilesToStore = (neighborsInBuffer+TILE_SIZE-1)/TILE_SIZE;
unsigned int tileStartIndex = 0;
if (indexInWarp == 0)
tileStartIndex = atomicAdd(&interactionCount[0], tilesToStore);
unsigned int newTileStartIndex = __shfl(tileStartIndex, 0);
if (newTileStartIndex+tilesToStore <= maxTiles) {
if (indexInWarp < tilesToStore)
interactingTiles[newTileStartIndex+indexInWarp] = x;
for (int j = 0; j <= tilesToStore/tilesPerWarp; j++) {
if (j*warpSize+indexInWarp < tilesToStore*TILE_SIZE)
interactingAtoms[newTileStartIndex*TILE_SIZE+j*warpSize+indexInWarp] = (j*warpSize+indexInWarp < neighborsInBuffer ? buffer[j*warpSize+indexInWarp] : PADDED_NUM_ATOMS);
}
}
}
}
// Record the positions the neighbor list is based on.
int i = threadIdx.x+blockIdx.x*GROUP_SIZE;
if (i < NUM_ATOMS) {
oldPositions[i] = posq[i];
}
}
extern "C" __global__ void copyInteractionCounts(const unsigned int* __restrict__ interactionCount,
unsigned int* __restrict__ pinnedInteractionCount) {
pinnedInteractionCount[0] = interactionCount[0];
pinnedInteractionCount[1] = interactionCount[1];
}
/**
* This file contains the device function for using cross-lane operations (ballot and shuffle)
*/
#if defined(TILE_SIZE)
#if !defined(AMD_RDNA)
// Two subwarps per warp
#define SHFL(var, srcLane) __shfl(var, (srcLane) & (TILE_SIZE - 1), TILE_SIZE)
#define BALLOT(var) (unsigned int)(__ballot(var) >> (threadIdx.x & ((64 - 1) ^ (TILE_SIZE - 1))))
#else
#define SHFL(var, srcLane) __shfl(var, srcLane)
#define BALLOT(var) __ballot(var)
#endif
#endif
template<class T>
static __inline__ __device__
T warpShuffle(const T& input, const int src_lane) {
static_assert(sizeof(T) % sizeof(int) == 0, "incorrect type size");
constexpr int words_no = sizeof(T) / sizeof(int);
T output;
#pragma unroll
for(int i = 0; i < words_no; i++) {
int word;
__builtin_memcpy(&word, reinterpret_cast<const char*>(&input) + i * sizeof(int), sizeof(int));
word = __builtin_amdgcn_ds_bpermute(src_lane << 2, word);
__builtin_memcpy(reinterpret_cast<char*>(&output) + i * sizeof(int), &word, sizeof(int));
}
return output;
}
template<int Subwarp, class T>
static __inline__ __device__
typename std::enable_if<(Subwarp == warpSize), T>::type
warpRotateLeft(const T& input) {
return warpShuffle(input, threadIdx.x + 1);
}
template<int Subwarp, class T>
static __inline__ __device__
typename std::enable_if<!(Subwarp == warpSize), T>::type
warpRotateLeft(const T& input) {
return warpShuffle(input, ((threadIdx.x + 1) & (Subwarp - 1)) | (threadIdx.x & ~(Subwarp - 1)));
}
/**
* Compute nonbonded interactions. The kernel is separated into two parts,
* tiles with exclusions and tiles without exclusions. It relies heavily on
* implicit warp-level synchronization. A tile is defined by two atom blocks
* each of warpsize. Each warp computes a range of tiles.
*
* Tiles with exclusions compute the entire set of interactions across
* atom blocks, equal to warpsize*warpsize. In order to avoid access conflicts
* the forces are computed and accumulated diagonally in the manner shown below
* where, suppose
*
* [a-h] comprise atom block 1, [i-p] comprise atom block 2
*
* 1 denotes the first set of calculations within the warp
* 2 denotes the second set of calculations within the warp
* ... etc.
*
* threads
* 0 1 2 3 4 5 6 7
* atom1
* L a b c d e f g h
* o i 1 2 3 4 5 6 7 8
* c j 8 1 2 3 4 5 6 7
* a k 7 8 1 2 3 4 5 6
* l l 6 7 8 1 2 3 4 5
* D m 5 6 7 8 1 2 3 4
* a n 4 5 6 7 8 1 2 3
* t o 3 4 5 6 7 8 1 2
* a p 2 3 4 5 6 7 8 1
*
* Tiles without exclusions read off directly from the neighbourlist interactingAtoms
* and follows the same force accumulation method. If more there are more interactingTiles
* than the size of the neighbourlist initially allocated, the neighbourlist is rebuilt
* and the full tileset is computed. This should happen on the first step, and very rarely
* afterwards.
*
* On diagonal exclusion tiles use __shfl to broadcast. For all other types of tiles __shfl
* is used to pass around the forces, positions, and parameters when computing the forces.
*
* [out]forceBuffers - forces on each atom to eventually be accumulated
* [out]energyBuffer - energyBuffer to eventually be accumulated
* [in]posq - x,y,z,charge
* [in]exclusions - 1024-bit flags denoting atom-atom exclusions for each tile
* [in]exclusionTiles - x,y denotes the indices of tiles that have an exclusion
* [in]startTileIndex - index into first tile to be processed
* [in]numTileIndices - number of tiles this context is responsible for processing
* [in]int tiles - the atom block for each tile
* [in]interactionCount - total number of tiles that have an interaction
* [in]maxTiles - stores the size of the neighbourlist in case it needs
* - to be expanded
* [in]periodicBoxSize - size of the Periodic Box, last dimension (w) not used
* [in]invPeriodicBox - inverse of the periodicBoxSize, pre-computed for speed
* [in]blockCenter - the center of each block in euclidean coordinates
* [in]blockSize - size of the each block, radiating from the center
* - x is half the distance of total length
* - y is half the distance of total width
* - z is half the distance of total height
* - w is not used
* [in]interactingAtoms - a list of interactions within a given tile
*
*/
extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded(
unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const tileflags* __restrict__ exclusions,
const int2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned long long numTileIndices
#ifdef USE_CUTOFF
, const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
const real4* __restrict__ blockSize, const unsigned int* __restrict__ interactingAtoms, unsigned int maxSinglePairs,
const int2* __restrict__ singlePairs
#endif
PARAMETER_ARGUMENTS) {
const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
const unsigned int warp = blockIdx.x*(THREAD_BLOCK_SIZE/TILE_SIZE) + threadIdx.x/TILE_SIZE; // global warpIndex
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp
const unsigned int tbx = threadIdx.x - tgx; // block warpIndex
mixed energy = 0;
INIT_DERIVATIVES
// First loop: process tiles that contain exclusions.
for (int pos = FIRST_EXCLUSION_TILE+warp; pos < LAST_EXCLUSION_TILE; pos+=totalWarps) {
const int2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
real3 force = make_real3(0);
unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS
tileflags excl = exclusions[pos*TILE_SIZE+tgx];
#endif
if (x == y) {
// This tile is on the diagonal.
real4 shflPosq = posq1;
// we do not need to fetch parameters from global since this is a symmetric tile
// instead we can broadcast the values using shuffle
#pragma unroll 2
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real4 posq2;
BROADCAST_WARP_DATA
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < MAX_CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = r2*invR;
LOAD_ATOM2_PARAMETERS
int atom2 = y*TILE_SIZE+j;
#ifdef USE_SYMMETRIC
real dEdR = 0.0f;
#else
real3 dEdR1 = make_real3(0);
real3 dEdR2 = make_real3(0);
#endif
#ifdef USE_EXCLUSIONS
bool isExcluded = !(atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && (excl & 1));
#endif
real tempEnergy = 0.0f;
const real interactionScale = 0.5f;
COMPUTE_INTERACTION
#ifdef INCLUDE_ENERGY
energy += 0.5f*tempEnergy;
#endif
#ifdef INCLUDE_FORCES
#ifdef USE_SYMMETRIC
force.x -= delta.x*dEdR;
force.y -= delta.y*dEdR;
force.z -= delta.z*dEdR;
#else
force.x -= dEdR1.x;
force.y -= dEdR1.y;
force.z -= dEdR1.z;
#endif
#endif
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
}
}
else {
// This is an off-diagonal tile.
unsigned int j = y*TILE_SIZE + tgx;
int atomIndex = j;
real4 shflPosq = posq[j];
real3 shflForce;
shflForce.x = 0.0f;
shflForce.y = 0.0f;
shflForce.z = 0.0f;
DECLARE_LOCAL_PARAMETERS
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
#ifdef USE_EXCLUSIONS
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif
#pragma unroll 2
for (j = 0; j < TILE_SIZE; j++) {
real4 posq2 = shflPosq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < MAX_CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = r2*invR;
LOAD_ATOM2_PARAMETERS
int atom2 = atomIndex;
#ifdef USE_SYMMETRIC
real dEdR = 0.0f;
#else
real3 dEdR1 = make_real3(0);
real3 dEdR2 = make_real3(0);
#endif
#ifdef USE_EXCLUSIONS
bool isExcluded = !(atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && (excl & 1));
#endif
real tempEnergy = 0.0f;
const real interactionScale = 1.0f;
COMPUTE_INTERACTION
#ifdef INCLUDE_ENERGY
energy += tempEnergy;
#endif
#ifdef INCLUDE_FORCES
#ifdef USE_SYMMETRIC
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
shflForce.x += delta.x;
shflForce.y += delta.y;
shflForce.z += delta.z;
#else // !USE_SYMMETRIC
force.x -= dEdR1.x;
force.y -= dEdR1.y;
force.z -= dEdR1.z;
shflForce.x += dEdR2.x;
shflForce.y += dEdR2.y;
shflForce.z += dEdR2.z;
#endif // end USE_SYMMETRIC
#endif
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
SHUFFLE_WARP_DATA
atomIndex = warpRotateLeft<TILE_SIZE>(atomIndex);
}
const unsigned int offset = atomIndex;
// write results for off diagonal tiles
#ifdef INCLUDE_FORCES
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>(realToFixedPoint(shflForce.x)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(shflForce.y)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(shflForce.z)));
#endif
}
// Write results for on and off diagonal tiles
#ifdef INCLUDE_FORCES
const unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>(realToFixedPoint(force.x)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(force.y)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(force.z)));
#endif
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_NEIGHBOR_LIST
const unsigned int numTiles = interactionCount[0];
if (numTiles > maxTiles)
return; // There wasn't enough memory for the neighbor list.
for (unsigned int pos0 = warp; pos0 < LAST_EXCLUSION_TILE+numTiles; pos0+=totalWarps) {
// Skip warps that may be still busy in the first loop
if (pos0 < LAST_EXCLUSION_TILE) {
continue;
}
const unsigned int pos = pos0-LAST_EXCLUSION_TILE;
#else
int pos = (int) (startTileIndex+warp*numTileIndices/totalWarps);
int end = (int) (startTileIndex+(warp+1)*numTileIndices/totalWarps);
int skipBase = 0;
int currentSkipIndex = tbx;
int skipTiles;
skipTiles = -1;
for (; pos < end; pos++) {
#endif
real3 force = make_real3(0);
bool includeTile = true;
// Extract the coordinates of this tile.
int x, y;
bool singlePeriodicCopy = false;
#ifdef USE_NEIGHBOR_LIST
x = tiles[pos];
real4 blockSizeX = blockSize[x];
singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= MAX_CUTOFF &&
0.5f*periodicBoxSize.y-blockSizeX.y >= MAX_CUTOFF &&
0.5f*periodicBoxSize.z-blockSizeX.z >= MAX_CUTOFF);
#else
y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
// Skip over tiles that have exclusions, since they were already processed.
while (SHFL(skipTiles, TILE_SIZE-1) < pos) {
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
int2 tile = exclusionTiles[skipBase+tgx];
skipTiles = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
skipTiles = end;
skipBase += TILE_SIZE;
currentSkipIndex = 0;
}
while (SHFL(skipTiles, currentSkipIndex) < pos)
currentSkipIndex++;
includeTile = (SHFL(skipTiles, currentSkipIndex) != pos);
#endif
if (includeTile) {
unsigned int atom1 = x*TILE_SIZE + tgx;
// Load atom data for this tile.
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
#ifdef USE_NEIGHBOR_LIST
unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
#else
unsigned int j = y*TILE_SIZE + tgx;
#endif
int atomIndex = j;
DECLARE_LOCAL_PARAMETERS
real4 shflPosq;
real3 shflForce;
shflForce.x = 0.0f;
shflForce.y = 0.0f;
shflForce.z = 0.0f;
if (j < PADDED_NUM_ATOMS) {
// Load position of atom j from from global memory
shflPosq = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
else {
shflPosq = make_real4(0, 0, 0, 0);
CLEAR_LOCAL_PARAMETERS
}
#ifdef USE_PERIODIC
if (singlePeriodicCopy) {
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
real4 blockCenterX = blockCenter[x];
APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
APPLY_PERIODIC_TO_POS_WITH_CENTER(shflPosq, blockCenterX)
#pragma unroll 2
for (j = 0; j < TILE_SIZE; j++) {
real4 posq2 = shflPosq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < MAX_CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = r2*invR;
LOAD_ATOM2_PARAMETERS
int atom2 = atomIndex;
#ifdef USE_SYMMETRIC
real dEdR = 0.0f;
#else
real3 dEdR1 = make_real3(0);
real3 dEdR2 = make_real3(0);
#endif
#ifdef USE_EXCLUSIONS
bool isExcluded = !(atom1 < NUM_ATOMS && atom2 < NUM_ATOMS);
#endif
real tempEnergy = 0.0f;
const real interactionScale = 1.0f;
COMPUTE_INTERACTION
#ifdef INCLUDE_ENERGY
energy += tempEnergy;
#endif
#ifdef INCLUDE_FORCES
#ifdef USE_SYMMETRIC
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
shflForce.x += delta.x;
shflForce.y += delta.y;
shflForce.z += delta.z;
#else // !USE_SYMMETRIC
force.x -= dEdR1.x;
force.y -= dEdR1.y;
force.z -= dEdR1.z;
shflForce.x += dEdR2.x;
shflForce.y += dEdR2.y;
shflForce.z += dEdR2.z;
#endif // end USE_SYMMETRIC
#endif
#ifdef USE_CUTOFF
}
#endif
SHUFFLE_WARP_DATA
atomIndex = warpRotateLeft<TILE_SIZE>(atomIndex);
}
}
else
#endif
{
// We need to apply periodic boundary conditions separately for each interaction.
#pragma unroll 2
for (j = 0; j < TILE_SIZE; j++) {
real4 posq2 = shflPosq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < MAX_CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = r2*invR;
LOAD_ATOM2_PARAMETERS
int atom2 = atomIndex;
#ifdef USE_SYMMETRIC
real dEdR = 0.0f;
#else
real3 dEdR1 = make_real3(0);
real3 dEdR2 = make_real3(0);
#endif
#ifdef USE_EXCLUSIONS
bool isExcluded = !(atom1 < NUM_ATOMS && atom2 < NUM_ATOMS);
#endif
real tempEnergy = 0.0f;
const real interactionScale = 1.0f;
COMPUTE_INTERACTION
#ifdef INCLUDE_ENERGY
energy += tempEnergy;
#endif
#ifdef INCLUDE_FORCES
#ifdef USE_SYMMETRIC
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
shflForce.x += delta.x;
shflForce.y += delta.y;
shflForce.z += delta.z;
#else // !USE_SYMMETRIC
force.x -= dEdR1.x;
force.y -= dEdR1.y;
force.z -= dEdR1.z;
shflForce.x += dEdR2.x;
shflForce.y += dEdR2.y;
shflForce.z += dEdR2.z;
#endif // end USE_SYMMETRIC
#endif
#ifdef USE_CUTOFF
}
#endif
SHUFFLE_WARP_DATA
atomIndex = warpRotateLeft<TILE_SIZE>(atomIndex);
}
}
// Write results.
#ifdef INCLUDE_FORCES
atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>(realToFixedPoint(force.x)));
atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(force.y)));
atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(force.z)));
unsigned int atom2 = atomIndex;
if (atom2 < PADDED_NUM_ATOMS) {
atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>(realToFixedPoint(shflForce.x)));
atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(shflForce.y)));
atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(shflForce.z)));
}
#endif
}
}
// Third loop: single pairs that aren't part of a tile.
#if USE_NEIGHBOR_LIST
const unsigned int numPairs = interactionCount[1];
if (numPairs > maxSinglePairs)
return; // There wasn't enough memory for the neighbor list.
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < numPairs; i += blockDim.x*gridDim.x) {
int2 pair = singlePairs[i];
int atom1 = pair.x;
int atom2 = pair.y;
real4 posq1 = posq[atom1];
real4 posq2 = posq[atom2];
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
if (r2 < MAX_CUTOFF_SQUARED) {
LOAD_ATOM1_PARAMETERS
int j = atom2;
DECLARE_LOCAL_PARAMETERS
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
LOAD_ATOM2_PARAMETERS
real invR = RSQRT(r2);
real r = r2*invR;
#ifdef USE_SYMMETRIC
real dEdR = 0.0f;
#else
real3 dEdR1 = make_real3(0);
real3 dEdR2 = make_real3(0);
#endif
bool isExcluded = false;
real tempEnergy = 0.0f;
const real interactionScale = 1.0f;
COMPUTE_INTERACTION
#ifdef INCLUDE_ENERGY
energy += tempEnergy;
#endif
#ifdef INCLUDE_FORCES
#ifdef USE_SYMMETRIC
real3 dEdR1 = delta*dEdR;
real3 dEdR2 = -dEdR1;
#endif
atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>(realToFixedPoint(-dEdR1.x)));
atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(-dEdR1.y)));
atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(-dEdR1.z)));
atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>(realToFixedPoint(-dEdR2.x)));
atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(-dEdR2.y)));
atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(-dEdR2.z)));
#endif
}
}
#endif
#ifdef INCLUDE_ENERGY
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
#endif
SAVE_DERIVATIVES
}
/**
* Sum the forces computed by different contexts.
*/
extern "C" __global__ void sumForces(long long* __restrict__ force, long long* __restrict__ buffer, int bufferSize, int numBuffers) {
int totalSize = bufferSize*numBuffers;
for (int index = blockDim.x*blockIdx.x+threadIdx.x; index < bufferSize; index += blockDim.x*gridDim.x) {
long long sum = force[index];
for (int i = index; i < totalSize; i += bufferSize)
sum += buffer[i];
force[index] = sum;
}
}
__device__ KEY_TYPE getValue(DATA_TYPE value) {
return SORT_KEY;
}
extern "C" {
/**
* Sort a list that is short enough to entirely fit in local memory. This is executed as
* a single thread block.
*/
__global__ void sortShortList(DATA_TYPE* __restrict__ data, unsigned int length) {
// Load the data into local memory.
extern __shared__ DATA_TYPE dataBuffer[];
for (int index = threadIdx.x; index < length; index += blockDim.x)
dataBuffer[index] = data[index];
__syncthreads();
// Perform a bitonic sort in local memory.
unsigned int lenghtNextPow2 = length <= 2 ? length : (1 << (32 - __clz(length - 1)));
for (unsigned int k = 2; k <= lenghtNextPow2; k *= 2) {
for (unsigned int j = k/2; j > 0; j /= 2) {
for (unsigned int i = threadIdx.x; i < length; i += blockDim.x) {
int ixj = i^j;
if (ixj > i && ixj < length) {
DATA_TYPE value1 = dataBuffer[i];
DATA_TYPE value2 = dataBuffer[ixj];
bool ascending = (__popc(~i & (lenghtNextPow2 - k)) & 1) == 0;
KEY_TYPE lowKey = (ascending ? getValue(value1) : getValue(value2));
KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
if (lowKey > highKey) {
dataBuffer[i] = value2;
dataBuffer[ixj] = value1;
}
}
}
__syncthreads();
}
}
// Write the data back to global memory.
for (int index = threadIdx.x; index < length; index += blockDim.x)
data[index] = dataBuffer[index];
}
/**
* An alternate kernel for sorting short lists. In this version every thread does a full
* scan through the data to select the destination for one element. This involves more
* work, but also parallelizes much better.
*/
__global__ void sortShortList2(const DATA_TYPE* __restrict__ dataIn, DATA_TYPE* __restrict__ dataOut, unsigned int length) {
extern __shared__ KEY_TYPE keyBuffer[];
int globalId = blockDim.x*blockIdx.x+threadIdx.x;
DATA_TYPE value = dataIn[globalId < length ? globalId : 0];
KEY_TYPE key = getValue(value);
int count = 0;
for (int blockStart = 0; blockStart < length; blockStart += blockDim.x) {
int numInBlock = min(static_cast<int>(blockDim.x), static_cast<int>(length-blockStart));
__syncthreads();
if (threadIdx.x < numInBlock)
keyBuffer[threadIdx.x] = getValue(dataIn[blockStart+threadIdx.x]);
__syncthreads();
for (int i = 0; i < numInBlock; i++) {
KEY_TYPE otherKey = keyBuffer[i];
count += (otherKey < key) | ((otherKey == key) & (blockStart+i < globalId));
}
}
if (globalId < length)
dataOut[count] = value;
}
inline __device__ void reduceMinMax(KEY_TYPE minimum, KEY_TYPE maximum, KEY_TYPE* minBuffer, KEY_TYPE* maxBuffer,
KEY_TYPE* minResult, KEY_TYPE* maxResult) {
minBuffer[threadIdx.x] = minimum;
maxBuffer[threadIdx.x] = maximum;
__syncthreads();
for (unsigned int step = 1; step < blockDim.x; step *= 2) {
if ((threadIdx.x+step < blockDim.x) & ((threadIdx.x&(2*step-1)) == 0)) {
minBuffer[threadIdx.x] = min(minBuffer[threadIdx.x], minBuffer[threadIdx.x+step]);
maxBuffer[threadIdx.x] = max(maxBuffer[threadIdx.x], maxBuffer[threadIdx.x+step]);
}
__syncthreads();
}
if (threadIdx.x == 0) {
*minResult = minBuffer[0];
*maxResult = maxBuffer[0];
}
}
/**
* Calculate the minimum and maximum value in the array to be sorted.
*/
__global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int length, KEY_TYPE* __restrict__ range,
unsigned int numBuckets, unsigned int* __restrict__ bucketOffset, unsigned int* __restrict__ counters) {
#if UNIFORM
extern __shared__ KEY_TYPE minBuffer[];
KEY_TYPE* maxBuffer = minBuffer+blockDim.x;
KEY_TYPE minimum = MAX_KEY;
KEY_TYPE maximum = MIN_KEY;
__shared__ bool isLastFinishedBlock;
if (threadIdx.x == 0) {
isLastFinishedBlock = false;
}
// Each thread calculates the range of a subset of values.
for (unsigned int index = blockDim.x*blockIdx.x+threadIdx.x; index < length; index += blockDim.x*gridDim.x) {
KEY_TYPE value = getValue(data[index]);
minimum = min(minimum, value);
maximum = max(maximum, value);
}
// Now reduce them and save partial results
reduceMinMax(minimum, maximum, minBuffer, maxBuffer, &range[blockIdx.x * 2 + 0], &range[blockIdx.x * 2 + 1]);
__threadfence();
if (threadIdx.x == 0) {
isLastFinishedBlock = atomicAdd(&counters[0], 1) + 1 == gridDim.x;
}
__syncthreads();
// The last block reduce partial results
if (isLastFinishedBlock) {
for (unsigned int index = threadIdx.x; index < gridDim.x; index += blockDim.x) {
minimum = min(minimum, range[index * 2 + 0]);
maximum = max(maximum, range[index * 2 + 1]);
}
reduceMinMax(minimum, maximum, minBuffer, maxBuffer, &range[0], &range[1]);
}
#endif
// Clear the bucket counters in preparation for the next kernel.
for (unsigned int index = blockDim.x*blockIdx.x+threadIdx.x; index < numBuckets; index += blockDim.x*gridDim.x)
bucketOffset[index] = 0;
}
/**
* Assign elements to buckets. This version is optimized for uniformly distributed data.
*/
__global__ void assignElementsToBuckets(const DATA_TYPE* __restrict__ data, unsigned int length, unsigned int numBuckets, const KEY_TYPE* __restrict__ range,
unsigned int* __restrict__ bucketOffset, unsigned int* __restrict__ bucketOfElement, unsigned int* __restrict__ offsetInBucket) {
float minValue = (float) (range[0]);
float maxValue = (float) (range[1]);
float bucketWidth = (maxValue-minValue)/numBuckets;
for (unsigned int index = blockDim.x*blockIdx.x+threadIdx.x; index < length; index += blockDim.x*gridDim.x) {
float key = (float) getValue(data[index]);
unsigned int bucketIndex = min((unsigned int) ((key-minValue)/bucketWidth), numBuckets-1);
offsetInBucket[index] = atomicAdd(&bucketOffset[bucketIndex], 1);
bucketOfElement[index] = bucketIndex;
}
}
/**
* Assign elements to buckets. This version is optimized for non-uniformly distributed data.
*/
__global__ void assignElementsToBuckets2(const DATA_TYPE* __restrict__ data, unsigned int length, unsigned int numBuckets, const KEY_TYPE* __restrict__ range,
unsigned int* __restrict__ bucketOffset, unsigned int* __restrict__ bucketOfElement, unsigned int* __restrict__ offsetInBucket) {
// Load 64 datapoints and sort them to get an estimate of the data distribution.
__shared__ KEY_TYPE elements[64];
if (threadIdx.x < 64) {
int index = (int) (threadIdx.x*length/64.0);
elements[threadIdx.x] = getValue(data[index]);
}
__syncthreads();
for (unsigned int k = 2; k <= 64; k *= 2) {
for (unsigned int j = k/2; j > 0; j /= 2) {
if (threadIdx.x < 64) {
int ixj = threadIdx.x^j;
if (ixj > threadIdx.x) {
KEY_TYPE value1 = elements[threadIdx.x];
KEY_TYPE value2 = elements[ixj];
bool ascending = (threadIdx.x&k) == 0;
KEY_TYPE lowKey = (ascending ? value1 : value2);
KEY_TYPE highKey = (ascending ? value2 : value1);
if (lowKey > highKey) {
elements[threadIdx.x] = value2;
elements[ixj] = value1;
}
}
}
__syncthreads();
}
}
// Create a function composed of linear segments mapping data values to bucket indices.
__shared__ float segmentLowerBound[9];
__shared__ float segmentBaseIndex[9];
__shared__ float segmentIndexScale[9];
if (threadIdx.x == 0) {
segmentLowerBound[0] = elements[0]-0.2f*(elements[5]-elements[0]);
segmentLowerBound[1] = elements[5];
segmentLowerBound[2] = elements[10];
segmentLowerBound[3] = elements[20];
segmentLowerBound[4] = elements[30];
segmentLowerBound[5] = elements[40];
segmentLowerBound[6] = elements[50];
segmentLowerBound[7] = elements[60];
segmentLowerBound[8] = elements[63]+0.2f*(elements[63]-elements[58]);
segmentBaseIndex[0] = numBuckets/16;
segmentBaseIndex[1] = 3*numBuckets/16;
segmentBaseIndex[2] = 5*numBuckets/16;
segmentBaseIndex[3] = 7*numBuckets/16;
segmentBaseIndex[4] = 9*numBuckets/16;
segmentBaseIndex[5] = 11*numBuckets/16;
segmentBaseIndex[6] = 13*numBuckets/16;
segmentBaseIndex[7] = 15*numBuckets/16;
segmentBaseIndex[8] = numBuckets;
for (int i = 0; i < 8; i++)
if (segmentLowerBound[i+1] == segmentLowerBound[i])
segmentIndexScale[i] = 0;
else
segmentIndexScale[i] = (segmentBaseIndex[i+1]-segmentBaseIndex[i])/(segmentLowerBound[i+1]-segmentLowerBound[i]);
}
__syncthreads();
// Assign elements to buckets.
for (unsigned int index = blockDim.x*blockIdx.x+threadIdx.x; index < length; index += blockDim.x*gridDim.x) {
float key = (float) getValue(data[index]);
int segment;
for (segment = 0; segment < 7 && key > segmentLowerBound[segment+1]; segment++)
;
unsigned int bucketIndex = segmentBaseIndex[segment]+(key-segmentLowerBound[segment])*segmentIndexScale[segment];
bucketIndex = min(max(0, bucketIndex), numBuckets-1);
offsetInBucket[index] = atomicAdd(&bucketOffset[bucketIndex], 1);
bucketOfElement[index] = bucketIndex;
}
}
/**
* Sum the bucket sizes to compute the start position of each bucket. This kernel
* is executed as a single work group.
*/
__global__ __launch_bounds__(1024) void computeBucketPositions(unsigned int numBuckets, unsigned int* __restrict__ bucketOffset, unsigned int* __restrict__ counters) {
extern __shared__ unsigned int posBuffer[];
unsigned int globalOffset = 0;
for (unsigned int startBucket = 0; startBucket < numBuckets; startBucket += blockDim.x) {
// Load the bucket sizes into local memory.
unsigned int globalIndex = startBucket+threadIdx.x;
__syncthreads();
posBuffer[threadIdx.x] = (globalIndex < numBuckets ? bucketOffset[globalIndex] : 0);
__syncthreads();
// Perform a parallel prefix sum.
for (unsigned int step = 1; step < blockDim.x; step *= 2) {
unsigned int add = (threadIdx.x >= step ? posBuffer[threadIdx.x-step] : 0);
__syncthreads();
posBuffer[threadIdx.x] += add;
__syncthreads();
}
// Write the results back to global memory.
if (globalIndex < numBuckets)
bucketOffset[globalIndex] = posBuffer[threadIdx.x]+globalOffset;
globalOffset += posBuffer[blockDim.x-1];
}
if (threadIdx.x == 0) {
counters[0] = 0;
}
}
/**
* Copy the input data into the buckets for sorting.
*/
__global__ void copyDataToBuckets(const DATA_TYPE* __restrict__ data, DATA_TYPE* __restrict__ buckets, unsigned int length, const unsigned int* __restrict__ bucketOffset, const unsigned int* __restrict__ bucketOfElement, const unsigned int* __restrict__ offsetInBucket) {
for (unsigned int index = blockDim.x*blockIdx.x+threadIdx.x; index < length; index += blockDim.x*gridDim.x) {
DATA_TYPE element = data[index];
unsigned int bucketIndex = bucketOfElement[index];
unsigned int offset = (bucketIndex == 0 ? 0 : bucketOffset[bucketIndex-1]);
buckets[offset+offsetInBucket[index]] = element;
}
}
/**
* Sort the data in each bucket.
*/
__global__ void sortBuckets(DATA_TYPE* __restrict__ data, const DATA_TYPE* __restrict__ buckets, const unsigned int* __restrict__ bucketOffset) {
extern __shared__ DATA_TYPE dataBuffer[];
unsigned int index = blockIdx.x;
unsigned int startIndex = (index == 0 ? 0 : bucketOffset[index-1]);
unsigned int endIndex = bucketOffset[index];
unsigned int length = endIndex-startIndex;
unsigned int lenghtNextPow2 = length <= 2 ? length : (1 << (32 - __clz(length - 1)));
if (length <= blockDim.x) {
// Load the data into local memory.
if (threadIdx.x < length)
dataBuffer[threadIdx.x] = buckets[startIndex+threadIdx.x];
else if (threadIdx.x < lenghtNextPow2)
dataBuffer[threadIdx.x] = MAX_VALUE;
__syncthreads();
// Perform a bitonic sort in local memory.
for (unsigned int k = 2; k <= lenghtNextPow2; k *= 2) {
for (unsigned int j = k/2; j > 0; j /= 2) {
int ixj = threadIdx.x^j;
if (threadIdx.x < lenghtNextPow2 && ixj > threadIdx.x) {
DATA_TYPE value1 = dataBuffer[threadIdx.x];
DATA_TYPE value2 = dataBuffer[ixj];
bool ascending = (threadIdx.x&k) == 0;
KEY_TYPE lowKey = (ascending ? getValue(value1) : getValue(value2));
KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
if (lowKey > highKey) {
dataBuffer[threadIdx.x] = value2;
dataBuffer[ixj] = value1;
}
}
__syncthreads();
}
}
// Write the data to the sorted array.
if (threadIdx.x < length)
data[startIndex+threadIdx.x] = dataBuffer[threadIdx.x];
}
else {
// Copy the bucket data over to the output array.
for (unsigned int i = threadIdx.x; i < length; i += blockDim.x)
data[startIndex+i] = buckets[startIndex+i];
__syncthreads();
// Perform a bitonic sort in global memory.
for (unsigned int k = 2; k <= lenghtNextPow2; k *= 2) {
for (unsigned int j = k/2; j > 0; j /= 2) {
for (unsigned int i = threadIdx.x; i < length; i += blockDim.x) {
int ixj = i^j;
if (ixj > i && ixj < length) {
DATA_TYPE value1 = data[startIndex+i];
DATA_TYPE value2 = data[startIndex+ixj];
bool ascending = (__popc(~i & (lenghtNextPow2 - k)) & 1) == 0;
KEY_TYPE lowKey = (ascending ? getValue(value1) : getValue(value2));
KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
if (lowKey > highKey) {
data[startIndex+i] = value2;
data[startIndex+ixj] = value1;
}
}
}
__syncthreads();
}
}
}
}
}
extern "C" {
/**
* This is called by the various functions below to clear a buffer.
*/
__device__ void clearSingleBuffer(int* __restrict__ buffer, int size) {
int index = blockDim.x*blockIdx.x+threadIdx.x;
int4* buffer4 = (int4*) buffer;
int sizeDiv4 = size/4;
while (index < sizeDiv4) {
buffer4[index] = make_int4(0);
index += blockDim.x*gridDim.x;
}
if (blockDim.x*blockIdx.x+threadIdx.x == 0)
for (int i = sizeDiv4*4; i < size; i++)
buffer[i] = 0;
}
/**
* Fill a buffer with 0.
*/
__global__ void clearBuffer(int* __restrict__ buffer, int size) {
clearSingleBuffer(buffer, size);
}
/**
* Fill two buffers with 0.
*/
__global__ void clearTwoBuffers(int* __restrict__ buffer1, int size1, int* __restrict__ buffer2, int size2) {
clearSingleBuffer(buffer1, size1);
clearSingleBuffer(buffer2, size2);
}
/**
* Fill three buffers with 0.
*/
__global__ void clearThreeBuffers(int* __restrict__ buffer1, int size1, int* __restrict__ buffer2, int size2, int* __restrict__ buffer3, int size3) {
clearSingleBuffer(buffer1, size1);
clearSingleBuffer(buffer2, size2);
clearSingleBuffer(buffer3, size3);
}
/**
* Fill four buffers with 0.
*/
__global__ void clearFourBuffers(int* __restrict__ buffer1, int size1, int* __restrict__ buffer2, int size2, int* __restrict__ buffer3, int size3, int* __restrict__ buffer4, int size4) {
clearSingleBuffer(buffer1, size1);
clearSingleBuffer(buffer2, size2);
clearSingleBuffer(buffer3, size3);
clearSingleBuffer(buffer4, size4);
}
/**
* Fill five buffers with 0.
*/
__global__ void clearFiveBuffers(int* __restrict__ buffer1, int size1, int* __restrict__ buffer2, int size2, int* __restrict__ buffer3, int size3, int* __restrict__ buffer4, int size4, int* __restrict__ buffer5, int size5) {
clearSingleBuffer(buffer1, size1);
clearSingleBuffer(buffer2, size2);
clearSingleBuffer(buffer3, size3);
clearSingleBuffer(buffer4, size4);
clearSingleBuffer(buffer5, size5);
}
/**
* Fill six buffers with 0.
*/
__global__ void clearSixBuffers(int* __restrict__ buffer1, int size1, int* __restrict__ buffer2, int size2, int* __restrict__ buffer3, int size3, int* __restrict__ buffer4, int size4, int* __restrict__ buffer5, int size5, int* __restrict__ buffer6, int size6) {
clearSingleBuffer(buffer1, size1);
clearSingleBuffer(buffer2, size2);
clearSingleBuffer(buffer3, size3);
clearSingleBuffer(buffer4, size4);
clearSingleBuffer(buffer5, size5);
clearSingleBuffer(buffer6, size6);
}
/**
* Sum the energy buffer.
*/
__global__ void reduceEnergy(const mixed* __restrict__ energyBuffer, mixed* __restrict__ result, int bufferSize, int workGroupSize) {
extern __shared__ mixed tempBuffer[];
const unsigned int thread = threadIdx.x;
mixed sum = 0;
for (unsigned int index = blockDim.x*blockIdx.x+threadIdx.x; index < bufferSize; index += blockDim.x*gridDim.x)
sum += energyBuffer[index];
tempBuffer[thread] = sum;
for (int i = 1; i < workGroupSize; i *= 2) {
__syncthreads();
if (thread%(i*2) == 0 && thread+i < workGroupSize)
tempBuffer[thread] += tempBuffer[thread+i];
}
if (thread == 0)
result[blockIdx.x] = tempBuffer[0];
}
/**
* Record the atomic charges into the posq array.
*/
__global__ void setCharges(real* __restrict__ charges, real4* __restrict__ posq, int* __restrict__ atomOrder, int numAtoms) {
for (int i = blockDim.x*blockIdx.x+threadIdx.x; i < numAtoms; i += blockDim.x*gridDim.x)
posq[i].w = charges[atomOrder[i]];
}
}
/**
* This file defines vector operations to simplify code elsewhere.
*/
// Versions of make_x() that take a single value and set all components to that.
inline __device__ int2 make_int2(int a) {
return make_int2(a, a);
}
inline __device__ int3 make_int3(int a) {
return make_int3(a, a, a);
}
inline __device__ int4 make_int4(int a) {
return make_int4(a, a, a, a);
}
inline __device__ float2 make_float2(float a) {
return make_float2(a, a);
}
inline __device__ float3 make_float3(float a) {
return make_float3(a, a, a);
}
inline __device__ float4 make_float4(float a) {
return make_float4(a, a, a, a);
}
inline __device__ double2 make_double2(double a) {
return make_double2(a, a);
}
inline __device__ double3 make_double3(double a) {
return make_double3(a, a, a);
}
inline __device__ double4 make_double4(double a) {
return make_double4(a, a, a, a);
}
// Multiply a vector by a constant.
inline __device__ int2 operator*(int2 a, int b) {
return make_int2(a.x*b, a.y*b);
}
inline __device__ int3 operator*(int3 a, int b) {
return make_int3(a.x*b, a.y*b, a.z*b);
}
inline __device__ int4 operator*(int4 a, int b) {
return make_int4(a.x*b, a.y*b, a.z*b, a.w*b);
}
inline __device__ int2 operator*(int a, int2 b) {
return make_int2(a*b.x, a*b.y);
}
inline __device__ int3 operator*(int a, int3 b) {
return make_int3(a*b.x, a*b.y, a*b.z);
}
inline __device__ int4 operator*(int a, int4 b) {
return make_int4(a*b.x, a*b.y, a*b.z, a*b.w);
}
inline __device__ float2 operator*(float2 a, float b) {
return make_float2(a.x*b, a.y*b);
}
inline __device__ float3 operator*(float3 a, float b) {
return make_float3(a.x*b, a.y*b, a.z*b);
}
inline __device__ float4 operator*(float4 a, float b) {
return make_float4(a.x*b, a.y*b, a.z*b, a.w*b);
}
inline __device__ float2 operator*(float a, float2 b) {
return make_float2(a*b.x, a*b.y);
}
inline __device__ float3 operator*(float a, float3 b) {
return make_float3(a*b.x, a*b.y, a*b.z);
}
inline __device__ float4 operator*(float a, float4 b) {
return make_float4(a*b.x, a*b.y, a*b.z, a*b.w);
}
inline __device__ double2 operator*(double2 a, double b) {
return make_double2(a.x*b, a.y*b);
}
inline __device__ double3 operator*(double3 a, double b) {
return make_double3(a.x*b, a.y*b, a.z*b);
}
inline __device__ double4 operator*(double4 a, double b) {
return make_double4(a.x*b, a.y*b, a.z*b, a.w*b);
}
inline __device__ double2 operator*(double a, double2 b) {
return make_double2(a*b.x, a*b.y);
}
inline __device__ double3 operator*(double a, double3 b) {
return make_double3(a*b.x, a*b.y, a*b.z);
}
inline __device__ double4 operator*(double a, double4 b) {
return make_double4(a*b.x, a*b.y, a*b.z, a*b.w);
}
// Divide a vector by a constant.
inline __device__ int2 operator/(int2 a, int b) {
return make_int2(a.x/b, a.y/b);
}
inline __device__ int3 operator/(int3 a, int b) {
return make_int3(a.x/b, a.y/b, a.z/b);
}
inline __device__ int4 operator/(int4 a, int b) {
return make_int4(a.x/b, a.y/b, a.z/b, a.w/b);
}
inline __device__ float2 operator/(float2 a, float b) {
float scale = 1.0f/b;
return a*scale;
}
inline __device__ float3 operator/(float3 a, float b) {
float scale = 1.0f/b;
return a*scale;
}
inline __device__ float4 operator/(float4 a, float b) {
float scale = 1.0f/b;
return a*scale;
}
inline __device__ double2 operator/(double2 a, double b) {
double scale = 1.0/b;
return a*scale;
}
inline __device__ double3 operator/(double3 a, double b) {
double scale = 1.0/b;
return a*scale;
}
inline __device__ double4 operator/(double4 a, double b) {
double scale = 1.0/b;
return a*scale;
}
// *= operator (multiply vector by constant)
inline __device__ void operator*=(int2& a, int b) {
a.x *= b; a.y *= b;
}
inline __device__ void operator*=(int3& a, int b) {
a.x *= b; a.y *= b; a.z *= b;
}
inline __device__ void operator*=(int4& a, int b) {
a.x *= b; a.y *= b; a.z *= b; a.w *= b;
}
inline __device__ void operator*=(float2& a, float b) {
a.x *= b; a.y *= b;
}
inline __device__ void operator*=(float3& a, float b) {
a.x *= b; a.y *= b; a.z *= b;
}
inline __device__ void operator*=(float4& a, float b) {
a.x *= b; a.y *= b; a.z *= b; a.w *= b;
}
inline __device__ void operator*=(double2& a, double b) {
a.x *= b; a.y *= b;
}
inline __device__ void operator*=(double3& a, double b) {
a.x *= b; a.y *= b; a.z *= b;
}
inline __device__ void operator*=(double4& a, double b) {
a.x *= b; a.y *= b; a.z *= b; a.w *= b;
}
// Dot product
inline __device__ float dot(float3 a, float3 b) {
return a.x*b.x+a.y*b.y+a.z*b.z;
}
inline __device__ double dot(double3 a, double3 b) {
return a.x*b.x+a.y*b.y+a.z*b.z;
}
// Cross product
inline __device__ float3 cross(float3 a, float3 b) {
return make_float3(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x);
}
inline __device__ float4 cross(float4 a, float4 b) {
return make_float4(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x, 0.0f);
}
inline __device__ double3 cross(double3 a, double3 b) {
return make_double3(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x);
}
inline __device__ double4 cross(double4 a, double4 b) {
return make_double4(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x, 0.0);
}
// Normalize a vector
inline __device__ float2 normalize(float2 a) {
return a*rsqrtf(a.x*a.x+a.y*a.y);
}
inline __device__ float3 normalize(float3 a) {
return a*rsqrtf(a.x*a.x+a.y*a.y+a.z*a.z);
}
inline __device__ float4 normalize(float4 a) {
return a*rsqrtf(a.x*a.x+a.y*a.y+a.z*a.z+a.w*a.w);
}
inline __device__ double2 normalize(double2 a) {
return a*rsqrt(a.x*a.x+a.y*a.y);
}
inline __device__ double3 normalize(double3 a) {
return a*rsqrt(a.x*a.x+a.y*a.y+a.z*a.z);
}
inline __device__ double4 normalize(double4 a) {
return a*rsqrt(a.x*a.x+a.y*a.y+a.z*a.z+a.w*a.w);
}
// Strip off the fourth component of a vector.
inline __device__ short3 trimTo3(short4 v) {
return make_short3(v.x, v.y, v.z);
}
inline __device__ int3 trimTo3(int4 v) {
return make_int3(v.x, v.y, v.z);
}
inline __device__ float3 trimTo3(float4 v) {
return make_float3(v.x, v.y, v.z);
}
inline __device__ double3 trimTo3(double4 v) {
return make_double3(v.x, v.y, v.z);
}
#
# Testing
#
ENABLE_TESTING()
SET(OPENMM_BUILD_HIP_DOUBLE_PRECISION_TESTS TRUE CACHE BOOL "Whether to build double precision versions of HIP test cases")
SET( INCLUDE_SERIALIZATION FALSE )
#SET( INCLUDE_SERIALIZATION TRUE )
IF( INCLUDE_SERIALIZATION )
INCLUDE_DIRECTORIES(${OPENMM_DIR}/serialization/include)
SET( SHARED_OPENMM_SERIALIZATION "OpenMMSerialization" )
ENDIF( INCLUDE_SERIALIZATION )
# Automatically create tests using files named "Test*.cpp"
FILE(GLOB TEST_PROGS "*Test*.cpp")
FOREACH(TEST_PROG ${TEST_PROGS})
GET_FILENAME_COMPONENT(TEST_ROOT ${TEST_PROG} NAME_WE)
# Link with shared library
ADD_EXECUTABLE(${TEST_ROOT} ${TEST_PROG})
TARGET_LINK_LIBRARIES(${TEST_ROOT} ${SHARED_TARGET})
SET_TARGET_PROPERTIES(${TEST_ROOT} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
ADD_TEST(${TEST_ROOT}Single ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT} single)
IF(OPENMM_BUILD_HIP_DOUBLE_PRECISION_TESTS)
ADD_TEST(${TEST_ROOT}Mixed ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT} mixed)
ADD_TEST(${TEST_ROOT}Double ${EXECUTABLE_OUTPUT_PATH}/${TEST_ROOT} double)
ENDIF(OPENMM_BUILD_HIP_DOUBLE_PRECISION_TESTS)
ENDFOREACH(TEST_PROG ${TEST_PROGS})
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2015-2016 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#ifdef WIN32
#define _USE_MATH_DEFINES // Needed to get M_PI
#endif
#include "HipPlatform.h"
#include <string>
OpenMM::HipPlatform platform;
void initializeTests(int argc, char* argv[]) {
if (argc > 1)
platform.setPropertyDefaultValue("Precision", std::string(argv[1]));
if (argc > 2)
platform.setPropertyDefaultValue("DeviceIndex", std::string(argv[2]));
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2023 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "HipTests.h"
#include "TestATMForce.h"
void runPlatformTests() {
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2015 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "HipTests.h"
#include "TestAndersenThermostat.h"
void runPlatformTests() {
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2015 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "HipTests.h"
#include "TestBrownianIntegrator.h"
void runPlatformTests() {
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2015 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "HipTests.h"
#include "TestCMAPTorsionForce.h"
void runPlatformTests() {
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2015 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "HipTests.h"
#include "TestCMMotionRemover.h"
void runPlatformTests() {
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2012-2015 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "HipTests.h"
#include "TestCheckpoints.h"
void testCheckpoint() {
const int numParticles = 100;
const double boxSize = 5.0;
const double temperature = 200.0;
System system;
system.addForce(new AndersenThermostat(0.0, 100.0));
NonbondedForce* nonbonded = new NonbondedForce();
system.addForce(nonbonded);
nonbonded->setNonbondedMethod(NonbondedForce::CutoffPeriodic);
vector<Vec3> positions(numParticles);
OpenMM_SFMT::SFMT sfmt;
init_gen_rand(0, sfmt);
for (int i = 0; i < numParticles; i++) {
system.addParticle(1.0);
nonbonded->addParticle(i%2 == 0 ? 0.1 : -0.1, 0.2, 0.1);
bool clash;
do {
clash = false;
positions[i] = Vec3(boxSize*genrand_real2(sfmt), boxSize*genrand_real2(sfmt), boxSize*genrand_real2(sfmt));
for (int j = 0; j < i; j++) {
Vec3 delta = positions[i]-positions[j];
if (sqrt(delta.dot(delta)) < 0.1)
clash = true;
}
} while (clash);
}
VerletIntegrator integrator(0.001);
Context context(system, integrator, platform);
context.setPositions(positions);
context.setPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
context.setParameter(AndersenThermostat::Temperature(), temperature);
// Run for a little while.
integrator.step(100);
// Record the current state and make a checkpoint.
State s1 = context.getState(State::Positions | State::Velocities | State::Parameters);
stringstream stream1(ios_base::out | ios_base::in | ios_base::binary);
context.createCheckpoint(stream1);
// Continue the simulation for a few more steps and record the state again.
integrator.step(10);
State s2 = context.getState(State::Positions | State::Velocities | State::Parameters);
// Restore from the checkpoint and see if everything gets restored correctly.
context.setPeriodicBoxVectors(Vec3(2*boxSize, 0, 0), Vec3(0, 2*boxSize, 0), Vec3(0, 0, 2*boxSize));
context.setParameter(AndersenThermostat::Temperature(), temperature+10);
context.loadCheckpoint(stream1);
State s3 = context.getState(State::Positions | State::Velocities | State::Parameters);
compareStates(s1, s3);
// Now simulate from there and see if the trajectory is identical.
integrator.step(10);
State s4 = context.getState(State::Positions | State::Velocities | State::Parameters);
compareStates(s2, s4);
// Create a new Context that uses multiple devices.
string deviceIndex = platform.getPropertyValue(context, HipPlatform::HipDeviceIndex());
map<string, string> props;
props[HipPlatform::HipDeviceIndex()] = deviceIndex+","+deviceIndex;
VerletIntegrator integrator2(0.001);
Context context2(system, integrator2, platform, props);
context2.setPositions(positions);
context2.setPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
context2.setParameter(AndersenThermostat::Temperature(), temperature);
// Now repeat all of the above tests with it.
integrator2.step(100);
State s5 = context2.getState(State::Positions | State::Velocities | State::Parameters);
stringstream stream2(ios_base::out | ios_base::in | ios_base::binary);
context2.createCheckpoint(stream2);
integrator2.step(10);
State s6 = context2.getState(State::Positions | State::Velocities | State::Parameters);
context2.setPeriodicBoxVectors(Vec3(2*boxSize, 0, 0), Vec3(0, 2*boxSize, 0), Vec3(0, 0, 2*boxSize));
context2.setParameter(AndersenThermostat::Temperature(), temperature+10);
context2.loadCheckpoint(stream2);
State s7 = context2.getState(State::Positions | State::Velocities | State::Parameters);
compareStates(s5, s7);
integrator2.step(10);
State s8 = context2.getState(State::Positions | State::Velocities | State::Parameters);
compareStates(s6, s8);
// See if a checkpoint created from one Context can be loaded into a different one.
VerletIntegrator integrator3(0.001);
Context context3(system, integrator3, platform);
stream1.seekg(0, stream1.beg);
context3.loadCheckpoint(stream1);
State s9 = context3.getState(State::Positions | State::Velocities | State::Parameters | State::Energy);
compareStates(s1, s9);
}
void runPlatformTests() {
testCheckpoint();
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2015 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "HipTests.h"
#include "TestCompoundIntegrator.h"
void runPlatformTests() {
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2015 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "HipTests.h"
#include "TestCustomAngleForce.h"
void testParallelComputation() {
System system;
const int numParticles = 200;
for (int i = 0; i < numParticles; i++)
system.addParticle(1.0);
CustomAngleForce* force = new CustomAngleForce("(theta-1.1)^2");
vector<double> params;
for (int i = 2; i < numParticles; i++)
force->addAngle(i-2, i-1, i, params);
system.addForce(force);
vector<Vec3> positions(numParticles);
for (int i = 0; i < numParticles; i++)
positions[i] = Vec3(i, i%2, 0);
VerletIntegrator integrator1(0.01);
Context context1(system, integrator1, platform);
context1.setPositions(positions);
State state1 = context1.getState(State::Forces | State::Energy);
VerletIntegrator integrator2(0.01);
string deviceIndex = platform.getPropertyValue(context1, HipPlatform::HipDeviceIndex());
map<string, string> props;
props[HipPlatform::HipDeviceIndex()] = deviceIndex+","+deviceIndex;
Context context2(system, integrator2, platform, props);
context2.setPositions(positions);
State state2 = context2.getState(State::Forces | State::Energy);
ASSERT_EQUAL_TOL(state1.getPotentialEnergy(), state2.getPotentialEnergy(), 1e-5);
for (int i = 0; i < numParticles; i++)
ASSERT_EQUAL_VEC(state1.getForces()[i], state2.getForces()[i], 1e-5);
}
void runPlatformTests() {
testParallelComputation();
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2015 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "HipTests.h"
#include "TestCustomBondForce.h"
void testParallelComputation() {
System system;
const int numParticles = 200;
for (int i = 0; i < numParticles; i++)
system.addParticle(1.0);
CustomBondForce* force = new CustomBondForce(("(r-1.1)^2"));
vector<double> params;
for (int i = 1; i < numParticles; i++)
force->addBond(i-1, i, params);
system.addForce(force);
vector<Vec3> positions(numParticles);
for (int i = 0; i < numParticles; i++)
positions[i] = Vec3(i, 0, 0);
VerletIntegrator integrator1(0.01);
Context context1(system, integrator1, platform);
context1.setPositions(positions);
State state1 = context1.getState(State::Forces | State::Energy);
VerletIntegrator integrator2(0.01);
string deviceIndex = platform.getPropertyValue(context1, HipPlatform::HipDeviceIndex());
map<string, string> props;
props[HipPlatform::HipDeviceIndex()] = deviceIndex+","+deviceIndex;
Context context2(system, integrator2, platform, props);
context2.setPositions(positions);
State state2 = context2.getState(State::Forces | State::Energy);
ASSERT_EQUAL_TOL(state1.getPotentialEnergy(), state2.getPotentialEnergy(), 1e-5);
for (int i = 0; i < numParticles; i++)
ASSERT_EQUAL_VEC(state1.getForces()[i], state2.getForces()[i], 1e-5);
}
void runPlatformTests() {
testParallelComputation();
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2023 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "HipTests.h"
#include "TestCustomCPPForce.h"
void runPlatformTests() {
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment