"...src/ssh:/git@developer.sourcefind.cn:2222/tsoc/openmm.git" did not exist on "cb92103e443a76e65a19f8afb40f5dcd6df0477e"
Commit 38f6c8f8 authored by Peter Eastman
Browse files

Checked in Cuda code

parent 95d79181
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "gputypes.h"
// Entry points declared here and defined in other translation units. Every
// function takes the gpuContext it operates on and returns nothing.
// NOTE(review): presumably each k* function launches the CUDA kernel(s) of the
// same name -- confirm against the kernel sources.
// Initialization
extern void kClearForces(gpuContext gpu);
extern void kCalculateObcGbsaBornSum(gpuContext gpu);
extern void kReduceObcGbsaBornSum(gpuContext gpu);
extern void kGenerateRandoms(gpuContext gpu);
// Main loop
extern void kCalculateCDLJObcGbsaForces1(gpuContext gpu);
extern void kCalculateCDLJObcGbsaForces1_12(gpuContext gpu);
extern void kCalculateCDLJForces(gpuContext gpu);
extern void kCalculateCDLJForces_12(gpuContext gpu);
extern void kCalculateObcGbsaForces1(gpuContext gpu);
extern void kCalculateObcGbsaForces1_12(gpuContext gpu);
extern void kReduceObcGbsaBornForces(gpuContext gpu);
extern void kCalculateObcGbsaForces2(gpuContext gpu);
extern void kCalculateObcGbsaForces2_12(gpuContext gpu);
extern void kCalculateLocalForces(gpuContext gpu);
extern void kCalculateAndersenThermostat(gpuContext gpu);
extern void kReduceBornSumAndForces(gpuContext gpu);
extern void kUpdatePart1(gpuContext gpu);
extern void kApplyFirstShake(gpuContext gpu);
extern void kUpdatePart2(gpuContext gpu);
extern void kApplySecondShake(gpuContext gpu);
extern void kVerletUpdatePart1(gpuContext gpu);
extern void kVerletUpdatePart2(gpuContext gpu);
extern void kBrownianUpdatePart1(gpuContext gpu);
extern void kBrownianUpdatePart2(gpuContext gpu);
// Extras
extern void kReduceForces(gpuContext gpu);
extern void kClearBornForces(gpuContext gpu);
// Initializers
// One Set*Sim / Get*Sim pair is declared per kernel family.
// NOTE(review): presumably these copy the simulation constants to / from the
// device-side copy used by that kernel family -- confirm in the .cu files.
extern void SetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu);
extern void GetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu);
extern void SetCalculateCDLJObcGbsaForces1_12Sim(gpuContext gpu);
extern void GetCalculateCDLJObcGbsaForces1_12Sim(gpuContext gpu);
extern void SetCalculateCDLJForcesSim(gpuContext gpu);
extern void GetCalculateCDLJForcesSim(gpuContext gpu);
extern void SetCalculateCDLJForces_12Sim(gpuContext gpu);
extern void GetCalculateCDLJForces_12Sim(gpuContext gpu);
extern void SetCalculateLocalForcesSim(gpuContext gpu);
extern void GetCalculateLocalForcesSim(gpuContext gpu);
extern void SetCalculateObcGbsaBornSumSim(gpuContext gpu);
extern void GetCalculateObcGbsaBornSumSim(gpuContext gpu);
extern void SetCalculateObcGbsaForces1Sim(gpuContext gpu);
extern void GetCalculateObcGbsaForces1Sim(gpuContext gpu);
extern void SetCalculateObcGbsaForces1_12Sim(gpuContext gpu);
extern void GetCalculateObcGbsaForces1_12Sim(gpuContext gpu);
extern void SetCalculateObcGbsaForces2Sim(gpuContext gpu);
extern void GetCalculateObcGbsaForces2Sim(gpuContext gpu);
extern void SetCalculateObcGbsaForces2_12Sim(gpuContext gpu);
extern void GetCalculateObcGbsaForces2_12Sim(gpuContext gpu);
extern void SetCalculateAndersenThermostatSim(gpuContext gpu);
extern void GetCalculateAndersenThermostatSim(gpuContext gpu);
extern void SetForcesSim(gpuContext gpu);
extern void GetForcesSim(gpuContext gpu);
extern void SetUpdateShakeHSim(gpuContext gpu);
extern void GetUpdateShakeHSim(gpuContext gpu);
extern void SetVerletUpdateSim(gpuContext gpu);
extern void GetVerletUpdateSim(gpuContext gpu);
extern void SetBrownianUpdateSim(gpuContext gpu);
extern void GetBrownianUpdateSim(gpuContext gpu);
extern void SetRandomSim(gpuContext gpu);
extern void GetRandomSim(gpuContext gpu);
#ifndef CUDATYPES_H
#define CUDATYPES_H
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdarg.h>
#include <limits>
#include <iostream>
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <builtin_types.h>
#include <vector_functions.h>
using namespace std;
// Checks the result of a CUDA runtime call.
//   status - a cudaError_t value (evaluated exactly once)
//   s      - C string printed in front of the CUDA error description
// If status is not cudaSuccess, prints "<s> <error string>" to stdout and
// terminates the process with exit(-1).
// The body is wrapped in do { } while (0) so the macro behaves as a single
// statement (safe inside an unbraced if/else); invoke it with a trailing ';'.
#define RTERROR(status, s) \
    do { \
        const cudaError_t rterror_status_ = (status); \
        if (rterror_status_ != cudaSuccess) { \
            printf("%s %s\n", (s), cudaGetErrorString(rterror_status_)); \
            exit(-1); \
        } \
    } while (0)
// Checks for an error left behind by the most recent kernel launch.
//   s - C string naming the kernel, used in the diagnostic
// Calls cudaGetLastError(); if it reports anything other than cudaSuccess,
// prints "Error: <error string> launching kernel <s>" to stdout and terminates
// the process with exit(-1).
// The body is wrapped in do { } while (0) so the macro behaves as a single
// statement (safe inside an unbraced if/else); invoke it with a trailing ';'.
#define LAUNCHERROR(s) \
    do { \
        const cudaError_t launcherror_status_ = cudaGetLastError(); \
        if (launcherror_status_ != cudaSuccess) { \
            printf("Error: %s launching kernel %s\n", cudaGetErrorString(launcherror_status_), (s)); \
            exit(-1); \
        } \
    } while (0)
// Abstract interface for objects that keep a copy of their data on both the
// CPU and the GPU. Implementations provide allocation, release, and transfer
// in each direction.
struct SoADeviceObject {
    // Virtual so that deleting an implementation through a SoADeviceObject
    // pointer runs the implementation's destructor.
    virtual ~SoADeviceObject() {}
    virtual void Allocate() = 0;    // acquire storage
    virtual void Deallocate() = 0;  // release storage
    virtual void Upload() = 0;      // copy CPU data to the GPU
    virtual void Download() = 0;    // copy GPU data to the CPU
};
// Paired host/device array of T, divided into _subStreams equally sized
// sub-streams. Storage on both sides is acquired by Allocate(), which every
// constructor calls, and released by Deallocate(), which the destructor calls.
// Upload() and Download() copy the whole buffer between host and device.
//
// NOTE(review): all four constructors give subStreams a default of 1, so a
// call that passes only a length looks ambiguous between overloads; the
// visible callers always pass both arguments -- confirm before relying on the
// default.
// NOTE(review): no copy constructor or assignment operator is declared, so a
// copy would share (and later double-free) the same buffers -- confirm that
// instances are never copied.
template <typename T>
struct CUDAStream : public SoADeviceObject
{
// Elements per sub-stream, as passed to the constructor
unsigned int _length;
// Number of sub-streams
unsigned int _subStreams;
// Elements between the starts of consecutive sub-streams: _length rounded up to a multiple of 16
unsigned int _stride;
// Host table of _subStreams pointers into _pSysData, one per sub-stream
T** _pSysStream;
// Host table of _subStreams pointers into _pDevData, one per sub-stream
T** _pDevStream;
// Host buffer holding _subStreams * _stride elements
T* _pSysData;
// Device buffer (from cudaMalloc) holding _subStreams * _stride elements
T* _pDevData;
CUDAStream(int length, int subStreams = 1);
CUDAStream(unsigned int length, unsigned int subStreams = 1);
CUDAStream(unsigned int length, int subStreams = 1);
CUDAStream(int length, unsigned int subStreams = 1);
virtual ~CUDAStream();
void Allocate();
void Deallocate();
void Upload();
void Download();
// Re-packs the host data into `newstreams` sub-streams; `interleave` is not used by the implementation in this header
void Collapse(unsigned int newstreams = 1, unsigned int interleave = 1);
};
// Compares two float streams; declared here and defined elsewhere.
// NOTE(review): the meaning of the return value, of `tolerance`, and of
// `maxindex` (default 0) is not visible in this header -- see the definition.
float CompareStreams(CUDAStream<float>& s1, CUDAStream<float>& s2, float tolerance, unsigned int maxindex = 0);
// Creates a stream with `length` elements in each of `subStreams` sub-streams.
// The stride is `length` rounded up to the next multiple of 16 elements, and
// host and device storage are allocated immediately.
template <typename T>
CUDAStream<T>::CUDAStream(int length, unsigned int subStreams)
    : _length(length),
      _subStreams(subStreams),
      _stride((length + 0xf) & ~0xfU)
{
    this->Allocate();
}
// Creates a stream with `length` elements in each of `subStreams` sub-streams.
// The stride is `length` rounded up to the next multiple of 16 elements, and
// host and device storage are allocated immediately.
template <typename T>
CUDAStream<T>::CUDAStream(unsigned int length, int subStreams)
    : _length(length),
      _subStreams(subStreams),
      _stride((length + 0xf) & ~0xfU)
{
    this->Allocate();
}
// Creates a stream with `length` elements in each of `subStreams` sub-streams.
// The stride is `length` rounded up to the next multiple of 16 elements, and
// host and device storage are allocated immediately.
template <typename T>
CUDAStream<T>::CUDAStream(unsigned int length, unsigned int subStreams)
    : _length(length),
      _subStreams(subStreams),
      _stride((length + 0xf) & ~0xfU)
{
    this->Allocate();
}
// Creates a stream with `length` elements in each of `subStreams` sub-streams.
// The stride is `length` rounded up to the next multiple of 16 elements, and
// host and device storage are allocated immediately.
template <typename T>
CUDAStream<T>::CUDAStream(int length, int subStreams)
    : _length(length),
      _subStreams(subStreams),
      _stride((length + 0xf) & ~0xfU)
{
    this->Allocate();
}
// Releases the host and device storage owned by this stream.
template <typename T>
CUDAStream<T>::~CUDAStream()
{
    this->Deallocate();
}
// Acquires all storage for the stream: the two per-sub-stream pointer tables,
// the host data buffer, and the device data buffer (via cudaMalloc). Each
// table entry is then pointed at the start of its sub-stream, _stride elements
// apart. A cudaMalloc failure is reported through RTERROR, which terminates
// the process.
template <typename T>
void CUDAStream<T>::Allocate()
{
    const unsigned int totalElements = _subStreams * _stride;

    // Host-side tables and data
    _pSysStream = new T*[_subStreams];
    _pDevStream = new T*[_subStreams];
    _pSysData = new T[totalElements];

    // Device-side data
    cudaError_t status = cudaMalloc((void **) &_pDevData, totalElements * sizeof(T));
    RTERROR(status, "cudaMalloc CUDAStream::Allocate failed");

    // Point every table entry at the start of its sub-stream
    unsigned int s = 0;
    while (s < _subStreams)
    {
        const unsigned int offset = s * _stride;
        _pSysStream[s] = _pSysData + offset;
        _pDevStream[s] = _pDevData + offset;
        ++s;
    }
}
// Releases everything acquired by Allocate(): the two pointer tables, the host
// data buffer, and the device data buffer. Every pointer is reset to NULL
// afterwards, so calling Deallocate() again (for example from the destructor
// after an explicit call) is harmless: delete[] NULL and cudaFree(NULL) are
// both no-ops. A cudaFree failure is reported through RTERROR, which
// terminates the process.
template <typename T>
void CUDAStream<T>::Deallocate()
{
    cudaError_t status;
    delete[] _pSysStream;
    _pSysStream = NULL;
    delete[] _pDevStream;
    _pDevStream = NULL;
    delete[] _pSysData;
    _pSysData = NULL;
    status = cudaFree(_pDevData);
    // Clear the device pointer as well, so a later call does not free it twice.
    _pDevData = NULL;
    RTERROR(status, "cudaFree CUDAStream::Deallocate failed");
}
// Copies the entire host buffer (_stride * _subStreams elements, padding
// included) to the device buffer with a blocking cudaMemcpy. A failure is
// reported through RTERROR, which terminates the process.
template <typename T>
void CUDAStream<T>::Upload()
{
    const size_t byteCount = _stride * _subStreams * sizeof(T);
    cudaError_t status = cudaMemcpy(_pDevData, _pSysData, byteCount, cudaMemcpyHostToDevice);
    RTERROR(status, "cudaMemcpy CUDAStream::Upload failed");
}
// Copies the entire device buffer (_stride * _subStreams elements, padding
// included) back to the host buffer with a blocking cudaMemcpy. A failure is
// reported through RTERROR, which terminates the process.
template <typename T>
void CUDAStream<T>::Download()
{
    const size_t byteCount = _stride * _subStreams * sizeof(T);
    cudaError_t status = cudaMemcpy(_pSysData, _pDevData, byteCount, cudaMemcpyDeviceToHost);
    RTERROR(status, "cudaMemcpy CUDAStream::Download failed");
}
// Re-packs the host-side data into `newstreams` sub-streams.
//
// The elements are visited element-major (for each index i, every sub-stream
// j) and dealt round-robin across the new sub-streams. With the default
// newstreams == 1 this interleaves the old sub-streams into one stream.
// Afterwards _stride, _length and _subStreams describe the new layout, and
// both pointer tables are remapped. Only the host buffer is rewritten; the
// device buffer keeps its old contents until the next Upload().
//
//   newstreams - number of sub-streams after the call; must be non-zero
//   interleave - accepted for interface compatibility; not used
//
// NOTE(review): the new stride and length use integer division, so the layout
// is only exact when newstreams divides _length * _subStreams.
template <typename T>
void CUDAStream<T>::Collapse(unsigned int newstreams, unsigned int interleave)
{
    (void) interleave;

    // Both divisions below would divide by zero otherwise.
    if (newstreams == 0)
    {
        printf("Error: CUDAStream::Collapse called with zero sub-streams\n");
        exit(-1);
    }

    T* pTemp = new T[_subStreams * _stride];
    unsigned int stream = 0;
    unsigned int pos = 0;
    unsigned int newstride = _stride * _subStreams / newstreams;
    unsigned int newlength = _length * _subStreams / newstreams;

    // Copy data into new format
    for (unsigned int i = 0; i < _length; i++)
    {
        for (unsigned int j = 0; j < _subStreams; j++)
        {
            pTemp[stream * newstride + pos] = _pSysStream[j][i];
            stream++;
            if (stream == newstreams)
            {
                stream = 0;
                pos++;
            }
        }
    }

    // The pointer tables were sized for the old sub-stream count; grow them
    // before remapping so the loop below cannot write past their end.
    if (newstreams > _subStreams)
    {
        delete[] _pSysStream;
        delete[] _pDevStream;
        _pSysStream = new T*[newstreams];
        _pDevStream = new T*[newstreams];
    }

    // Remap stream pointers
    for (unsigned int i = 0; i < newstreams; i++)
    {
        _pSysStream[i] = _pSysData + i * newstride;
        _pDevStream[i] = _pDevData + i * newstride;
    }

    // Copy data back into original stream
    for (unsigned int i = 0; i < newlength; i++)
        for (unsigned int j = 0; j < newstreams; j++)
            _pSysStream[j][i] = pTemp[j * newstride + i];

    _stride = newstride;
    _length = newlength;
    _subStreams = newstreams;
    delete[] pTemp;
}
// Launch-configuration constants.
// GRID is 32 and GRIDBITS is 5, i.e. GRID == 1 << GRIDBITS.
// NOTE(review): the G8X_ / GT2XX_ prefixes presumably select values for two
// different GPU generations -- confirm where these constants are consumed.
static const int GRID = 32;
static const int GRIDBITS = 5;
// Threads per block, by kernel family
static const int G8X_NONBOND_THREADS_PER_BLOCK = 256;
static const int GT2XX_NONBOND_THREADS_PER_BLOCK = 320;
static const int G8X_BORNFORCE2_THREADS_PER_BLOCK = 256;
static const int GT2XX_BORNFORCE2_THREADS_PER_BLOCK = 320;
static const int G8X_SHAKE_THREADS_PER_BLOCK = 128;
static const int GT2XX_SHAKE_THREADS_PER_BLOCK = 256;
static const int G8X_UPDATE_THREADS_PER_BLOCK = 192;
static const int GT2XX_UPDATE_THREADS_PER_BLOCK = 384;
static const int G8X_LOCALFORCES_THREADS_PER_BLOCK = 192;
static const int GT2XX_LOCALFORCES_THREADS_PER_BLOCK = 384;
static const int G8X_THREADS_PER_BLOCK = 256;
static const int GT2XX_THREADS_PER_BLOCK = 256;
static const int G8X_RANDOM_THREADS_PER_BLOCK = 256;
static const int GT2XX_RANDOM_THREADS_PER_BLOCK = 384;
// Nonbond work units per SM
static const int G8X_NONBOND_WORKUNITS_PER_SM = 220;
static const int GT2XX_NONBOND_WORKUNITS_PER_SM = 256;
// Plain aggregate describing one simulation: kernel launch dimensions, work
// unit bookkeeping, physical and integrator constants, and raw pointers to the
// per-atom and per-interaction arrays.
// NOTE(review): the pointer members are presumably device addresses (the
// gpuSet*Parameters functions store CUDAStream::_pDevStream[0] into members
// with these names) -- confirm for the members not set there.
struct cudaGmxSimulation {
// Constants
unsigned int atoms; // Number of atoms
unsigned int paddedNumberOfAtoms; // Padded number of atoms
unsigned int blocks; // Number of blocks to launch across linear kernels
unsigned int nonbond_blocks; // Number of blocks to launch across CDLJ and Born Force Part1
unsigned int bornForce2_blocks; // Number of blocks to launch across Born Force 2
unsigned int threads_per_block; // Threads per block to launch
unsigned int nonbond_threads_per_block; // Threads per block in nonbond kernel calls
unsigned int bornForce2_threads_per_block; // Threads per block in Born Force 2 kernel calls
unsigned int max_update_threads_per_block; // Maximum threads per block in update kernel calls
unsigned int update_threads_per_block; // Threads per block in update kernel calls
unsigned int bf_reduce_threads_per_block; // Threads per block in Born Force reduction calls
unsigned int bsf_reduce_threads_per_block; // Threads per block in Born Sum And Forces reduction calls
unsigned int max_shake_threads_per_block; // Maximum threads per block in shake kernel calls
unsigned int shake_threads_per_block; // Threads per block in shake kernel calls
unsigned int nonshake_threads_per_block; // Threads per block in nonshaking kernel call
unsigned int max_localForces_threads_per_block; // Maximum threads per block in local forces kernel calls
unsigned int localForces_threads_per_block; // Threads per block in local forces kernel calls
unsigned int random_threads_per_block; // Threads per block in RNG kernel calls
unsigned int workUnits; // Number of work units
unsigned int* pWorkUnit; // Pointer to work units
unsigned int nonbond_workBlock; // Number of work units running simultaneously per block in CDLJ and Born Force Part 1
unsigned int bornForce2_workBlock; // Number of work units running second half of Born Forces calculation
unsigned int workUnitsPerSM; // Number of workblocks per SM
unsigned int nbWorkUnitsPerBlock; // Number of work units assigned to each nonbond block
unsigned int nbWorkUnitsPerBlockRemainder; // Remainder of work units to assign across lower numbered nonbond blocks
unsigned int bf2WorkUnitsPerBlock; // Number of work units assigned to each bornForce2 block
unsigned int bf2WorkUnitsPerBlockRemainder; // Remainder of work units to assign across lower numbered bornForce2 blocks
unsigned int stride; // Atomic attributes stride
unsigned int stride2; // Atomic attributes stride x 2
unsigned int stride3; // Atomic attributes stride x 3
unsigned int stride4; // Atomic attributes stride x 4
unsigned int exclusionStride; // Exclusion list stride = stride / GRID
unsigned int nonbondOutputBuffers; // Nonbond output buffers per nonbond call
unsigned int totalNonbondOutputBuffers; // Total nonbond output buffers
unsigned int outputBuffers; // Number of output buffers
float bigFloat; // Floating point value used as a flag for Shaken atoms
float epsfac; // Epsilon factor for CDLJ calculations
float probeRadius; // SASA probe radius
float surfaceAreaFactor; // ACE approximation surface area factor
float electricConstant; // ACE approximation electric constant
float forceConversionFactor; // kJ to kcal force conversion factor
float preFactor; // Born electrostatic pre-factor
float dielectricOffset; // Born dielectric offset
float alphaOBC; // OBC alpha factor
float betaOBC; // OBC beta factor
float gammaOBC; // OBC gamma factor
float deltaT; // Molecular dynamics deltaT constant
float oneOverDeltaT; // 1/deltaT
float B; // Molecular dynamics B constant
float C; // Molecular dynamics C constant
float D; // Molecular dynamics D constant
float EPH; // Molecular dynamics EPH constant
float EMH; // Molecular dynamics EMH constant
float EM; // Molecular dynamics EM constant
float EP; // Molecular dynamics EP constant
float GDT; // Molecular dynamics GDT constant
float OneMinusEM; // Molecular dynamics OneMinusEM constant
float TauOneMinusEM; // Molecular dynamics TauOneMinusEM constant
float TauDOverEMMinusOne; // Molecular dynamics TauDOverEMMinusOne constant
float T; // Molecular dynamics T constant
float kT; // Boltzmann's constant times T
float V; // Molecular dynamics V constant
float X; // Molecular dynamics X constant
float Yv; // Molecular dynamics Yv constant
float Yx; // Molecular dynamics Yx constant
float tau; // Molecular dynamics tau constant
float fix1; // Molecular dynamics fix1 constant
float oneOverFix1; // Molecular dynamics reciprocal of fix1 constant
float DOverTauC; // Molecular dynamics DOverTauC constant
float collisionProbability; // Collision probability for Andersen thermostat
float2* pObcData; // Pointer to fixed Born data
float2* pAttr; // Pointer to additional atom attributes (sig, eps)
unsigned int bonds; // Number of bonds
int4* pBondID; // Bond atom and output buffer IDs
float2* pBondParameter; // Bond parameters
unsigned int bond_angles; // Number of bond angles
int4* pBondAngleID1; // Bond angle atom and first output buffer IDs
int2* pBondAngleID2; // Bond angle output buffer IDs
float2* pBondAngleParameter; // Bond angle parameters
unsigned int dihedrals; // Number of dihedrals
int4* pDihedralID1; // Dihedral IDs
int4* pDihedralID2; // Dihedral output buffer IDs
float4* pDihedralParameter; // Dihedral parameters
unsigned int rb_dihedrals; // Number of Ryckaert Bellemans dihedrals
int4* pRbDihedralID1; // Ryckaert Bellemans Dihedral IDs
int4* pRbDihedralID2; // Ryckaert Bellemans Dihedral output buffer IDs
float4* pRbDihedralParameter1; // Ryckaert Bellemans Dihedral parameters
float2* pRbDihedralParameter2; // Ryckaert Bellemans Dihedral parameters
unsigned int LJ14s; // Number of Lennard Jones 1-4 interactions
int4* pLJ14ID; // Lennard Jones 1-4 atom and output buffer IDs
float4* pLJ14Parameter; // Lennard Jones 1-4 parameters
float inverseTotalMass; // Used in linear momentum removal
unsigned int ShakeConstraints; // Total number of Shake constraints
unsigned int NonShakeConstraints; // Total number of NonShake atoms
unsigned int maxShakeIterations; // Maximum shake iterations
unsigned int degreesOfFreedom; // Number of degrees of freedom in system
float shakeTolerance; // Shake tolerance
float InvMassJ; // Shake inverse mass for hydrogens
int* pNonShakeID; // Not Shaking atoms
int4* pShakeID; // Shake atoms and phase
float4* pShakeParameter; // Shake parameters
unsigned int* pExclusion; // Nonbond exclusion data
unsigned int bond_offset; // Offset to end of bonds
unsigned int bond_angle_offset; // Offset to end of bond angles
unsigned int dihedral_offset; // Offset to end of dihedrals
unsigned int rb_dihedral_offset; // Offset to end of Ryckaert Bellemans dihedrals
unsigned int LJ14_offset; // Offset to end of Lennard Jones 1-4 parameters
// Mutable stuff
float4* pPosq; // Pointer to atom positions and charges
float4* pPosqP; // Pointer to mid-integration atom positions
float4* pOldPosq; // Pointer to old atom positions
float4* pVelm4; // Pointer to atom velocity and inverse mass
float4* pvVector4; // Pointer to atom v Vector
float4* pxVector4; // Pointer to atom x Vector
float4* pForce4; // Pointer to all force4 data
float4* pForce4a; // Pointer to first set of force4 data
float4* pForce4b; // Pointer to second set of force4 data
float4* pOutForce4; // Pointer to output float4 force
float* pBornForce; // Pointer to Born force data
float* pBornSum; // Pointer to Born Radii calculation output buffers
float* pBornRadii; // Pointer to Born Radii
float* pObcChain; // Pointer to OBC chain data
float4* pLinearMomentum; // Pointer to linear momentum
// Random numbers
float4* pRandom4a; // Pointer to first set of 4 random numbers
float4* pRandom4b; // Pointer to second set of 4 random numbers
float2* pRandom2a; // Pointer to first set of 2 random numbers
float2* pRandom2b; // Pointer to second set of 2 random numbers
uint4* pRandomSeed; // Pointer to random seeds
int* pRandomPosition; // Pointer to random number positions
unsigned int randoms; // Number of randoms
unsigned int totalRandoms; // Number of randoms plus overflow.
unsigned int totalRandomsTimesTwo; // Used for generating randoms
unsigned int randomIterations; // Number of iterations before regenerating randoms
unsigned int randomFrames; // Number of frames of random numbers
};
// A bundle of three float3 vectors, stored in the order v0, v1, v2.
struct Vectors {
    float3 v0, v1, v2;
};
#endif
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
#include <sstream>
#include <ctime>
#include <cmath>
#include <map>
#ifdef WIN32
#include <windows.h>
#else
#include <stdint.h>
#endif
using namespace std;
#include "gputypes.h"
#include "cudaKernels.h"
#include "OpenMMException.h"
using OpenMM::OpenMMException;
// Short aliases for fixed-width scalar types; several of them are used by the
// FAH_* record structures in this file. The 64-bit aliases come from MSVC's
// __int64 when WIN32 is defined and from <stdint.h> otherwise.
#ifdef WIN32
typedef unsigned __int64 u64;
typedef signed __int64 s64;
#else
typedef uint64_t u64;
typedef int64_t s64;
#endif
typedef unsigned int u32;
typedef float f32;
typedef double f64;
typedef char ascii;
typedef char utf8;
typedef unsigned char u8;
typedef signed char s8;
typedef unsigned short u16;
typedef signed short s16;
// Per-atom record: a 4-byte type tag, a charge, and a radius.
// NOTE(review): looks like a fixed-layout record for an external (presumably
// Folding@home) file format -- confirm before reordering or resizing fields.
typedef struct
{
u8 type[4];
f32 charge;
f32 radius;
} FAH_ATOM;
// A bond between two atoms, given as a pair of unsigned values a and b with
// a < b (per the rule noted on the field).
// NOTE(review): a and b are presumably atom indices -- confirm.
typedef struct
{
u32 a; /* rule: a < b */
u32 b;
} FAH_BOND;
// Three single-precision components x, y, z.
// NOTE(review): presumably one atom position -- units are not visible here.
typedef struct
{
f32 x;
f32 y;
f32 z;
} FAH_XYZ;
// Descriptive header record: magic number and version, a 64-byte name, a
// timestamp, iteration and frame counts, and atom and bond counts. The fields
// after the "v2" marker hold user name, team, and done strings.
// NOTE(review): the "v2" marker suggests those fields exist only from format
// version 2 onward -- confirm against the reader/writer of this record.
typedef struct
{
u32 magic;
u32 version;
utf8 name[64];
s64 timestamp;
u64 iterations;
u32 frames;
u32 atom_count;
u32 bond_count;
/* v2 */
utf8 user_name[64];
utf8 user_team[16];
utf8 user_done[16];
} FAH_INFO;
// Progress record: magic number and version, a timestamp, the iterations and
// frames completed so far, and an energy and temperature value.
// NOTE(review): units of energy and temperature are not visible here.
typedef struct
{
u32 magic;
u32 version;
s64 timestamp;
u64 iterations_done;
u32 frames_done;
f32 energy;
f32 temperature;
} FAH_CURRENT;
// Top-level container: an info header and a progress record held by value,
// plus pointers to atom, bond, and coordinate arrays.
// NOTE(review): array lengths presumably come from info.atom_count and
// info.bond_count, and ownership of the arrays is not visible here -- confirm.
typedef struct
{
FAH_INFO info;
FAH_CURRENT current;
FAH_ATOM * atoms;
FAH_BOND * bonds;
FAH_XYZ * xyz;
} PROTEIN;
// A central atom together with up to three peripheral atoms constrained to it.
// All constraints in one cluster share the same distance and the same
// peripheral inverse mass.
struct ShakeCluster {
    int centralID;                            // ID of the central atom (-1 until assigned)
    int peripheralID[3];                      // IDs of the constrained atoms; the first `size` entries are valid
    int size;                                 // Number of peripheral atoms added so far (0..3)
    float distance;                           // Constraint distance shared by every peripheral atom
    float centralInvMass, peripheralInvMass;  // Inverse masses of the central and peripheral atoms

    // Creates an empty cluster with no central atom. Every member is given a
    // definite value so that addAtom() never reads an indeterminate `size`.
    ShakeCluster() : centralID(-1), size(0), distance(0.0f), centralInvMass(0.0f), peripheralInvMass(0.0f) {
        peripheralID[0] = peripheralID[1] = peripheralID[2] = -1;
    }

    // Creates an empty cluster around the given central atom.
    // (Initializers are listed in member declaration order.)
    ShakeCluster(int centralID, float invMass) : centralID(centralID), size(0), distance(0.0f), centralInvMass(invMass), peripheralInvMass(0.0f) {
        peripheralID[0] = peripheralID[1] = peripheralID[2] = -1;
    }

    // Adds one peripheral atom to the cluster.
    //   id      - ID of the peripheral atom
    //   dist    - constraint distance to the central atom
    //   invMass - inverse mass of the peripheral atom
    // Throws OpenMMException if the cluster already holds three atoms, or if
    // dist or invMass differs from the values of the atoms already present.
    void addAtom(int id, float dist, float invMass) {
        if (size >= 3)
            throw OpenMMException("A single atom may only have three constraints");
        if (size > 0 && dist != distance)
            throw OpenMMException("All constraints for a central atom must have the same distance");
        if (size > 0 && invMass != peripheralInvMass)
            throw OpenMMException("All constraints for a central atom must have the same mass");
        peripheralID[size++] = id;
        distance = dist;
        peripheralInvMass = invMass;
    }
};
// File-scope physical constants and build switches.
// NOTE(review): units are not stated here except where a comment gives them;
// several names mirror cudaGmxSimulation fields and are presumably copied
// there during setup -- confirm.
static const float dielectricOffset = 0.009f;
static const float PI = 3.1415926535f;
static const float probeRadius = 0.14f;
static const float forceConversionFactor = 0.4184f;
// Earlier candidate values for surfaceAreaFactor are kept below, commented out.
//static const float surfaceAreaFactor = -6.0f * 0.06786f * forceConversionFactor * 1000.0f; // PI * 4.0f * 0.0049f * 1000.0f;
//static const float surfaceAreaFactor = -6.0f * PI * 4.0f * 0.0049f * 1000.0f;
static const float surfaceAreaFactor = -6.0f*PI*0.0216f*1000.0f*0.4184f;
//static const float surfaceAreaFactor = -1.7035573959e+001;
//static const float surfaceAreaFactor = -166.02691f;
//static const float surfaceAreaFactor = 1.0f;
static const float alphaOBC = 1.0f;
static const float betaOBC = 0.8f;
static const float gammaOBC = 4.85f;
static const float kcalMolTokJNM = -0.4184f;
static const float electricConstant = -166.02691f;
static const float defaultInnerDielectric = 1.0f;
static const float defaultSolventDielectric = 78.3f;
static const float KILO = 1e3; // Thousand
static const float BOLTZMANN = 1.380658e-23f; // (J/K)
static const float AVOGADRO = 6.0221367e23f; // ()
// Gas constant, derived as BOLTZMANN * AVOGADRO
static const float RGAS = BOLTZMANN * AVOGADRO; // (J/(mol K))
// Gas constant scaled down by KILO
static const float BOLTZ = (RGAS / KILO); // (kJ/(mol K))
// When set to 1, the gpuSet*Parameters functions print every record they store to cout.
#define DUMP_PARAMETERS 0
// NOTE(review): DeltaShake is not referenced in the visible part of this file -- presumably selects a Shake variant elsewhere.
#define DeltaShake
// Reads harmonic bond parameters from a text file and passes them to
// gpuSetBondParameters().
//
// Expected layout: the first line starts with the number of bonds (the rest of
// that line is ignored); it is followed by one whitespace-separated record per
// bond: <index> <atom1> <atom2> <length> <k>. The leading index is read and
// discarded.
//
//   gpu   - context that receives the parameters
//   fname - path of the parameter file
// Returns the number of bonds read.
// If the file cannot be opened, the count is missing or negative, or a record
// cannot be parsed, an error is printed to cout and the process exits with -1.
extern "C"
int gpuReadBondParameters(gpuContext gpu, char* fname)
{
    ifstream infile(fname);
    if (infile.fail())
    {
        cout << "Error opening harmonic bond parameter file " << fname << endl;
        exit(-1);
    }

    int bonds = 0;
    if (!(infile >> bonds) || bonds < 0)
    {
        cout << "Error reading bond count from harmonic bond parameter file " << fname << endl;
        exit(-1);
    }

    // Skip whatever follows the count on the header line, regardless of its length.
    string restOfLine;
    getline(infile, restOfLine);

    vector<int> atom1(bonds);
    vector<int> atom2(bonds);
    vector<float> length(bonds);
    vector<float> k(bonds);
    for (int i = 0; i < bonds; i++)
    {
        int junk; // leading record index, not used
        infile >>
            junk >>
            atom1[i] >>
            atom2[i] >>
            length[i] >>
            k[i];
        if (infile.fail())
        {
            cout << "Error reading bond " << i << " from harmonic bond parameter file " << fname << endl;
            exit(-1);
        }
    }
    gpuSetBondParameters(gpu, atom1, atom2, length, k);
    return bonds;
}
// Stores harmonic bond parameters in the context and uploads them.
//
// One bond is created per entry of atom1; atom2, length and k are indexed in
// parallel. For every bond the function records the two atom IDs, the length
// and force constant, and one output buffer slot per atom, taken from (and
// post-incrementing) gpu->pOutputBufferCounter for that atom. Two new
// CUDAStreams are allocated, attached to the context (psBondID,
// psBondParameter), their first device sub-stream pointers are stored in
// gpu->sim, and both are uploaded.
// When DUMP_PARAMETERS is 1, each bond is also printed to cout.
extern "C"
void gpuSetBondParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<float>& length, const vector<float>& k)
{
    const int bondCount = atom1.size();
    gpu->sim.bonds = bondCount;

    CUDAStream<int4>* idStream = new CUDAStream<int4>(bondCount, 1);
    gpu->psBondID = idStream;
    gpu->sim.pBondID = idStream->_pDevStream[0];

    CUDAStream<float2>* paramStream = new CUDAStream<float2>(bondCount, 1);
    gpu->psBondParameter = paramStream;
    gpu->sim.pBondParameter = paramStream->_pDevStream[0];

    int4* ids = idStream->_pSysStream[0];
    float2* params = paramStream->_pSysStream[0];
    for (int n = 0; n < bondCount; ++n)
    {
        int4& id = ids[n];
        float2& param = params[n];

        id.x = atom1[n];
        id.y = atom2[n];
        param.x = length[n];
        param.y = k[n];

        // Claim the next free output buffer slot of each atom, in atom order.
        id.z = gpu->pOutputBufferCounter[id.x]++;
        id.w = gpu->pOutputBufferCounter[id.y]++;
#if (DUMP_PARAMETERS == 1)
        cout << n << " " << id.x << " " << id.y << " " << id.z << " " << id.w << " "
             << param.x << " " << param.y << endl;
#endif
    }

    idStream->Upload();
    paramStream->Upload();
}
// Reads harmonic bond angle parameters from a text file and passes them to
// gpuSetBondAngleParameters().
//
// Expected layout: the first line starts with the number of bond angles (the
// rest of that line is ignored); it is followed by one whitespace-separated
// record per angle: <index> <atom1> <atom2> <atom3> <angle> <k>. The leading
// index is read and discarded.
//
//   gpu   - context that receives the parameters
//   fname - path of the parameter file
// Returns the number of bond angles read.
// If the file cannot be opened, the count is missing or negative, or a record
// cannot be parsed, an error is printed to cout and the process exits with -1.
extern "C"
int gpuReadBondAngleParameters(gpuContext gpu, char* fname)
{
    ifstream infile(fname);
    if (infile.fail())
    {
        cout << "Error opening harmonic bond angle parameter file " << fname << endl;
        exit(-1);
    }

    int bond_angles = 0;
    if (!(infile >> bond_angles) || bond_angles < 0)
    {
        cout << "Error reading bond angle count from harmonic bond angle parameter file " << fname << endl;
        exit(-1);
    }

    // Skip whatever follows the count on the header line, regardless of its length.
    string restOfLine;
    getline(infile, restOfLine);

    vector<int> atom1(bond_angles);
    vector<int> atom2(bond_angles);
    vector<int> atom3(bond_angles);
    vector<float> angle(bond_angles);
    vector<float> k(bond_angles);
    for (int i = 0; i < bond_angles; i++)
    {
        int junk; // leading record index, not used
        infile >>
            junk >>
            atom1[i] >>
            atom2[i] >>
            atom3[i] >>
            angle[i] >>
            k[i];
        if (infile.fail())
        {
            cout << "Error reading bond angle " << i << " from harmonic bond angle parameter file " << fname << endl;
            exit(-1);
        }
    }
    gpuSetBondAngleParameters(gpu, atom1, atom2, atom3, angle, k);
    return bond_angles;
}
// Stores harmonic bond angle parameters in the context and uploads them.
//
// One angle is created per entry of atom1; atom2, atom3, angle and k are
// indexed in parallel. For every angle the function records the three atom
// IDs, the angle and force constant, and one output buffer slot per atom,
// taken from (and post-incrementing) gpu->pOutputBufferCounter for that atom.
// The slot for atom1 is stored next to the atom IDs; the slots for atom2 and
// atom3 go into the second ID stream. Three new CUDAStreams are allocated,
// attached to the context, their first device sub-stream pointers are stored
// in gpu->sim, and all three are uploaded.
// When DUMP_PARAMETERS is 1, each angle is also printed to cout.
extern "C"
void gpuSetBondAngleParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<int>& atom3,
        const vector<float>& angle, const vector<float>& k)
{
    const int angleCount = atom1.size();
    gpu->sim.bond_angles = angleCount;

    CUDAStream<int4>* atomStream = new CUDAStream<int4>(angleCount, 1);
    gpu->psBondAngleID1 = atomStream;
    gpu->sim.pBondAngleID1 = atomStream->_pDevStream[0];

    CUDAStream<int2>* slotStream = new CUDAStream<int2>(angleCount, 1);
    gpu->psBondAngleID2 = slotStream;
    gpu->sim.pBondAngleID2 = slotStream->_pDevStream[0];

    CUDAStream<float2>* paramStream = new CUDAStream<float2>(angleCount, 1);
    gpu->psBondAngleParameter = paramStream;
    gpu->sim.pBondAngleParameter = paramStream->_pDevStream[0];

    int4* atomIds = atomStream->_pSysStream[0];
    int2* slotIds = slotStream->_pSysStream[0];
    float2* params = paramStream->_pSysStream[0];
    for (int n = 0; n < angleCount; ++n)
    {
        int4& id1 = atomIds[n];
        int2& id2 = slotIds[n];
        float2& param = params[n];

        id1.x = atom1[n];
        id1.y = atom2[n];
        id1.z = atom3[n];
        param.x = angle[n];
        param.y = k[n];

        // Claim the next free output buffer slot of each atom, in atom order.
        id1.w = gpu->pOutputBufferCounter[id1.x]++;
        id2.x = gpu->pOutputBufferCounter[id1.y]++;
        id2.y = gpu->pOutputBufferCounter[id1.z]++;
#if (DUMP_PARAMETERS == 1)
        cout << n << " " << id1.x << " " << id1.y << " " << id1.z << " " << id1.w << " "
             << id2.x << " " << id2.y << " "
             << param.x << " " << param.y << endl;
#endif
    }

    atomStream->Upload();
    slotStream->Upload();
    paramStream->Upload();
}
// Reads dihedral parameters from a text file and passes them to
// gpuSetDihedralParameters().
//
// Expected layout: the first line starts with the number of dihedrals (the
// rest of that line is ignored); it is followed by one whitespace-separated
// record per dihedral:
// <index> <atom1> <atom2> <atom3> <atom4> <k> <phase> <periodicity>.
// The leading index is read and discarded.
//
//   gpu   - context that receives the parameters
//   fname - path of the parameter file
// Returns the number of dihedrals read.
// If the file cannot be opened, the count is missing or negative, or a record
// cannot be parsed, an error is printed to cout and the process exits with -1.
extern "C"
int gpuReadDihedralParameters(gpuContext gpu, char* fname)
{
    ifstream infile(fname);
    if (infile.fail())
    {
        cout << "Error opening dihedral parameter file " << fname << endl;
        exit(-1);
    }

    int dihedrals = 0;
    if (!(infile >> dihedrals) || dihedrals < 0)
    {
        cout << "Error reading dihedral count from dihedral parameter file " << fname << endl;
        exit(-1);
    }

    // Skip whatever follows the count on the header line, regardless of its length.
    string restOfLine;
    getline(infile, restOfLine);

    vector<int> atom1(dihedrals);
    vector<int> atom2(dihedrals);
    vector<int> atom3(dihedrals);
    vector<int> atom4(dihedrals);
    vector<float> k(dihedrals);
    vector<float> phase(dihedrals);
    vector<int> periodicity(dihedrals);
    for (int i = 0; i < dihedrals; i++)
    {
        int junk; // leading record index, not used
        infile >>
            junk >>
            atom1[i] >>
            atom2[i] >>
            atom3[i] >>
            atom4[i] >>
            k[i] >>
            phase[i] >>
            periodicity[i];
        if (infile.fail())
        {
            cout << "Error reading dihedral " << i << " from dihedral parameter file " << fname << endl;
            exit(-1);
        }
    }
    gpuSetDihedralParameters(gpu, atom1, atom2, atom3, atom4, k, phase, periodicity);
    return dihedrals;
}
// Stores dihedral parameters in the context and uploads them.
//
// One dihedral is created per entry of atom1; the other vectors are indexed in
// parallel. For every dihedral the function records the four atom IDs, then
// k, phase and periodicity (converted to float) in the x, y and z components
// of the parameter entry (w is not written), and one output buffer slot per
// atom, taken from (and post-incrementing) gpu->pOutputBufferCounter for that
// atom. Three new CUDAStreams are allocated, attached to the context, their
// first device sub-stream pointers are stored in gpu->sim, and all three are
// uploaded.
// When DUMP_PARAMETERS is 1, each dihedral is also printed to cout.
extern "C"
void gpuSetDihedralParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<int>& atom3, const vector<int>& atom4,
        const vector<float>& k, const vector<float>& phase, const vector<int>& periodicity)
{
    const int dihedralCount = atom1.size();
    gpu->sim.dihedrals = dihedralCount;

    CUDAStream<int4>* atomStream = new CUDAStream<int4>(dihedralCount, 1);
    gpu->psDihedralID1 = atomStream;
    gpu->sim.pDihedralID1 = atomStream->_pDevStream[0];

    CUDAStream<int4>* slotStream = new CUDAStream<int4>(dihedralCount, 1);
    gpu->psDihedralID2 = slotStream;
    gpu->sim.pDihedralID2 = slotStream->_pDevStream[0];

    CUDAStream<float4>* paramStream = new CUDAStream<float4>(dihedralCount, 1);
    gpu->psDihedralParameter = paramStream;
    gpu->sim.pDihedralParameter = paramStream->_pDevStream[0];

    int4* atomIds = atomStream->_pSysStream[0];
    int4* slotIds = slotStream->_pSysStream[0];
    float4* params = paramStream->_pSysStream[0];
    for (int n = 0; n < dihedralCount; ++n)
    {
        int4& id1 = atomIds[n];
        int4& id2 = slotIds[n];
        float4& param = params[n];

        id1.x = atom1[n];
        id1.y = atom2[n];
        id1.z = atom3[n];
        id1.w = atom4[n];
        param.x = k[n];
        param.y = phase[n];
        param.z = (float) periodicity[n];

        // Claim the next free output buffer slot of each atom, in atom order.
        id2.x = gpu->pOutputBufferCounter[id1.x]++;
        id2.y = gpu->pOutputBufferCounter[id1.y]++;
        id2.z = gpu->pOutputBufferCounter[id1.z]++;
        id2.w = gpu->pOutputBufferCounter[id1.w]++;
#if (DUMP_PARAMETERS == 1)
        cout << n << " " << id1.x << " " << id1.y << " " << id1.z << " " << id1.w << " "
             << id2.x << " " << id2.y << " " << id2.z << " " << id2.w << " "
             << param.x << " " << param.y << " " << param.z << endl;
#endif
    }

    atomStream->Upload();
    slotStream->Upload();
    paramStream->Upload();
}
// Reads Ryckaert-Bellemans dihedral parameters from a text file and hands
// them to gpuSetRbDihedralParameters().
//
// File layout as consumed here: the first token is the number of dihedrals and
// the rest of that line is discarded; each following record is
//   <ignored int> atom1 atom2 atom3 atom4 c0 c1 c2 c3 c4 c5
//
// Returns the number of dihedrals read. If the file cannot be opened an error
// is printed and the process exits with status -1.
//
// The CUDA streams are allocated only by gpuSetRbDihedralParameters(). This
// function used to allocate an identical set first, which was then overwritten
// and leaked.
extern "C"
int gpuReadRbDihedralParameters(gpuContext gpu, char* fname)
{
    ifstream infile(fname);
    if (!infile.fail())
    {
        char buff[512];
        // Zero-initialised so a malformed header yields an empty parameter set
        // rather than an indeterminate count.
        int rb_dihedrals = 0;
        infile >> rb_dihedrals;
        infile.getline(buff, 512);
        vector<int> atom1(rb_dihedrals);
        vector<int> atom2(rb_dihedrals);
        vector<int> atom3(rb_dihedrals);
        vector<int> atom4(rb_dihedrals);
        vector<float> c0(rb_dihedrals);
        vector<float> c1(rb_dihedrals);
        vector<float> c2(rb_dihedrals);
        vector<float> c3(rb_dihedrals);
        vector<float> c4(rb_dihedrals);
        vector<float> c5(rb_dihedrals);
        for (int i = 0; i < rb_dihedrals; i++)
        {
            int junk;   // leading record number, not used
            infile >>
                junk >>
                atom1[i] >>
                atom2[i] >>
                atom3[i] >>
                atom4[i] >>
                c0[i] >>
                c1[i] >>
                c2[i] >>
                c3[i] >>
                c4[i] >>
                c5[i];
        }
        // Sets gpu->sim.rb_dihedrals, allocates the streams and uploads them.
        gpuSetRbDihedralParameters(gpu, atom1, atom2, atom3, atom4, c0, c1, c2, c3, c4, c5);
        return rb_dihedrals;
    }
    else
    {
        cout << "Error opening Ryckaert-Bellemans dihedral parameter file " << fname << endl;
        exit(-1);
    }
    return 0;
}
// Stores Ryckaert-Bellemans dihedral parameters in the GPU context and uploads
// them to the device.
//
//   atom1..atom4 - indices of the four atoms of each dihedral (parallel arrays)
//   c0..c5       - the six coefficients of each dihedral; c0..c3 go into the
//                  float4 parameter stream, c4 and c5 into the float2 stream
//
// The number of dihedrals is atom1.size(). For each atom of each dihedral a
// slot number is drawn from gpu->pOutputBufferCounter (post-incremented) and
// recorded in the second ID stream.
extern "C"
void gpuSetRbDihedralParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<int>& atom3, const vector<int>& atom4,
        const vector<float>& c0, const vector<float>& c1, const vector<float>& c2, const vector<float>& c3, const vector<float>& c4, const vector<float>& c5)
{
    const int numDihedrals = atom1.size();
    gpu->sim.rb_dihedrals = numDihedrals;

    // Atom indices.
    CUDAStream<int4>* atomStream = new CUDAStream<int4>(numDihedrals, 1);
    gpu->psRbDihedralID1 = atomStream;
    gpu->sim.pRbDihedralID1 = atomStream->_pDevStream[0];

    // Output-buffer slots.
    CUDAStream<int4>* slotStream = new CUDAStream<int4>(numDihedrals, 1);
    gpu->psRbDihedralID2 = slotStream;
    gpu->sim.pRbDihedralID2 = slotStream->_pDevStream[0];

    // Coefficients c0..c3.
    CUDAStream<float4>* coeffStreamA = new CUDAStream<float4>(numDihedrals, 1);
    gpu->psRbDihedralParameter1 = coeffStreamA;
    gpu->sim.pRbDihedralParameter1 = coeffStreamA->_pDevStream[0];

    // Coefficients c4, c5.
    CUDAStream<float2>* coeffStreamB = new CUDAStream<float2>(numDihedrals, 1);
    gpu->psRbDihedralParameter2 = coeffStreamB;
    gpu->sim.pRbDihedralParameter2 = coeffStreamB->_pDevStream[0];

    for (int n = 0; n < numDihedrals; ++n)
    {
        int4&   atoms  = atomStream->_pSysStream[0][n];
        int4&   slots  = slotStream->_pSysStream[0][n];
        float4& coeffA = coeffStreamA->_pSysStream[0][n];
        float2& coeffB = coeffStreamB->_pSysStream[0][n];

        atoms.x = atom1[n];
        atoms.y = atom2[n];
        atoms.z = atom3[n];
        atoms.w = atom4[n];

        coeffA.x = c0[n];
        coeffA.y = c1[n];
        coeffA.z = c2[n];
        coeffA.w = c3[n];
        coeffB.x = c4[n];
        coeffB.y = c5[n];

        // Slots are claimed in x, y, z, w order; the order matters when the
        // same atom index appears more than once.
        slots.x = gpu->pOutputBufferCounter[atoms.x]++;
        slots.y = gpu->pOutputBufferCounter[atoms.y]++;
        slots.z = gpu->pOutputBufferCounter[atoms.z]++;
        slots.w = gpu->pOutputBufferCounter[atoms.w]++;

#if (DUMP_PARAMETERS == 1)
        cout << n << " "
             << atoms.x << " " << atoms.y << " " << atoms.z << " " << atoms.w << " "
             << slots.x << " " << slots.y << " " << slots.z << " " << slots.w << " "
             << coeffA.x << " " << coeffA.y << " " << coeffA.z << " " << coeffA.w << " "
             << coeffB.x << " " << coeffB.y
             << endl;
#endif
    }

    atomStream->Upload();
    slotStream->Upload();
    coeffStreamA->Upload();
    coeffStreamB->Upload();
}
// Reads Lennard-Jones 1-4 pair parameters from a text file and forwards them
// to gpuSetLJ14Parameters().
//
// Header line as consumed here: the pair count, then 60 characters that are
// skipped, then epsfac, then 7 characters that are skipped, then fudge; the
// rest of the line is discarded. Each following record is
//   <ignored int> atom1 atom2 c6 c12 q1 q2
//
// Returns the number of pairs. If the file cannot be opened an error is
// printed and the process exits with status -1.
extern "C"
int gpuReadLJ14Parameters(gpuContext gpu, char* fname)
{
    ifstream infile(fname);
    if (infile.fail())
    {
        cout << "Error opening Lennard-Jones 1-4 parameter file " << fname << endl;
        exit(-1);
    }

    char scratch[1024];
    float epsfac = 0.0f;
    float fudge = 0.0f;
    int numPairs;

    // Fixed-width header parsing: get(buf, n) consumes at most n-1 characters.
    infile >> numPairs;
    infile.get(scratch, 61);
    infile >> epsfac;
    infile.get(scratch, 8);
    infile >> fudge;
    infile.getline(scratch, 512);

    vector<int> atom1(numPairs), atom2(numPairs);
    vector<float> c6(numPairs), c12(numPairs);
    vector<float> q1(numPairs), q2(numPairs);

    for (int n = 0; n < numPairs; ++n)
    {
        int recordNumber;   // present in the file but not used
        infile >> recordNumber >> atom1[n] >> atom2[n] >> c6[n] >> c12[n] >> q1[n] >> q2[n];
    }

    gpuSetLJ14Parameters(gpu, epsfac, fudge, atom1, atom2, c6, c12, q1, q2);
    return numPairs;
}
// Stores Lennard-Jones 1-4 pair parameters in the GPU context and uploads
// them to the device.
//
//   epsfac, fudge - multiplied together to form the charge-product scale
//   atom1, atom2  - atom indices of each pair (parallel arrays)
//   c6, c12       - per-pair Lennard-Jones coefficients
//   q1, q2        - per-pair charges
//
// Per pair the ID stream holds (atom1, atom2, slot1, slot2), where the slots
// are drawn from gpu->pOutputBufferCounter (post-incremented). The parameter
// stream holds
//   .x = c6^2 / c12          (0 when c12 == 0)
//   .y = (c12 / c6)^(1/6)    (1 when c12 == 0)
//   .z = epsfac * fudge * q1 * q2
//
// NOTE(review): only c12 == 0 is special-cased; a pair with c6 == 0 and
// c12 != 0 divides by zero in the .y term. Confirm whether such input occurs.
extern "C"
void gpuSetLJ14Parameters(gpuContext gpu, float epsfac, float fudge, const vector<int>& atom1, const vector<int>& atom2,
        const vector<float>& c6, const vector<float>& c12, const vector<float>& q1, const vector<float>& q2)
{
    int LJ14s = atom1.size();
    float scale = epsfac * fudge;

    gpu->sim.LJ14s = LJ14s;
    CUDAStream<int4>* psLJ14ID = new CUDAStream<int4>(LJ14s, 1);
    gpu->psLJ14ID = psLJ14ID;
    gpu->sim.pLJ14ID = psLJ14ID->_pDevStream[0];
    CUDAStream<float4>* psLJ14Parameter = new CUDAStream<float4>(LJ14s, 1);
    gpu->psLJ14Parameter = psLJ14Parameter;
    gpu->sim.pLJ14Parameter = psLJ14Parameter->_pDevStream[0];

    for (int i = 0; i < LJ14s; i++)
    {
        psLJ14ID->_pSysStream[0][i].x = atom1[i];
        psLJ14ID->_pSysStream[0][i].y = atom2[i];
        psLJ14ID->_pSysStream[0][i].z = gpu->pOutputBufferCounter[psLJ14ID->_pSysStream[0][i].x]++;
        psLJ14ID->_pSysStream[0][i].w = gpu->pOutputBufferCounter[psLJ14ID->_pSysStream[0][i].y]++;
        float p0, p1, p2;
        if (c12[i] == 0.0f)
        {
            p0 = 0.0f;
            p1 = 1.0f;
        }
        else
        {
            p0 = c6[i] * c6[i] / c12[i];
            p1 = pow(c12[i] / c6[i], 1.0f / 6.0f);
        }
        p2 = scale * q1[i] * q2[i];
        psLJ14Parameter->_pSysStream[0][i].x = p0;
        psLJ14Parameter->_pSysStream[0][i].y = p1;
        psLJ14Parameter->_pSysStream[0][i].z = p2;

        // The dump must live inside the loop: it reports per-pair values and
        // uses i, p0, p1 and p2, which are out of scope after the loop.
#if (DUMP_PARAMETERS == 1)
        cout <<
            i << " " <<
            psLJ14ID->_pSysStream[0][i].x << " " <<
            psLJ14ID->_pSysStream[0][i].y << " " <<
            psLJ14ID->_pSysStream[0][i].z << " " <<
            psLJ14ID->_pSysStream[0][i].w << " " <<
            psLJ14Parameter->_pSysStream[0][i].x << " " <<
            psLJ14Parameter->_pSysStream[0][i].y << " " <<
            psLJ14Parameter->_pSysStream[0][i].z << " " <<
            p0 << " " <<
            p1 << " " <<
            p2 << " " <<
            endl;
#endif
    }

    psLJ14ID->Upload();
    psLJ14Parameter->Upload();
}
// Looks up the radius stored in the atom-type table for the type named `s`.
// Returns the `r` field of the first entry whose name equals `s`, or 0.0f when
// no entry matches.
extern "C"
float gpuGetAtomicRadius(gpuContext gpu, string s)
{
    float radius = 0.0f;
    for (int entry = 0; entry < gpu->gAtomTypes; ++entry)
    {
        if (s == gpu->gpAtomTable[entry].name)
        {
            radius = gpu->gpAtomTable[entry].r;
            break;   // first match wins
        }
    }
    return radius;
}
// Looks up the symbol stored in the atom-type table for the type named `s`.
// Returns the `symbol` field of the first entry whose name equals `s`, or a
// blank (' ') when no entry matches.
extern "C"
unsigned char gpuGetAtomicSymbol(gpuContext gpu, string s)
{
    unsigned char symbol = ' ';
    for (int entry = 0; entry < gpu->gAtomTypes; ++entry)
    {
        if (s == gpu->gpAtomTable[entry].name)
        {
            symbol = gpu->gpAtomTable[entry].symbol;
            break;   // first match wins
        }
    }
    return symbol;
}
// Loads the atom-type table (name and radius per type) from a text file.
//
// The file is read twice. The first pass counts the lines that start with a
// blank (one per atom type) and the lines that come before the first such line
// (header lines to skip). The second pass skips the header lines and reads
// `name r` from each of the following records, discarding the rest of each
// line. A one-letter symbol is then derived from the radius:
//   r < 1.3 -> 'H', r < 1.6 -> 'O', r < 1.7 -> 'N', otherwise 'C'.
//
// Any previously allocated table is released first. Returns the number of atom
// types; if the file cannot be opened an error is printed and the process
// exits with status -1.
extern "C"
int gpuReadAtomicParameters(gpuContext gpu, char* fname)
{
    gpu->gAtomTypes = 0;
    if (gpu->gpAtomTable)
        delete[] gpu->gpAtomTable;

    ifstream counter(fname);
    if (counter.fail())
    {
        cout << "Error opening atom parameter file " << fname << endl;
        exit(-1);
    }

    // Pass 1: count header lines and atom-type lines.
    char line[1024];
    int headerLines = 0;
    bool inHeader = true;
    while (counter.getline(line, 512))
    {
        if (line[0] == ' ')
        {
            inHeader = false;
            gpu->gAtomTypes++;
        }
        else if (inHeader)
        {
            headerLines++;
        }
    }
    counter.close();

    // Pass 2: re-open, skip the header and fill the table.
    gpu->gpAtomTable = new gpuAtomType[gpu->gAtomTypes];
    ifstream reader(fname);
    for (int n = 0; n < headerLines; ++n)
        reader.getline(line, 512);

    for (int n = 0; n < gpu->gAtomTypes; ++n)
    {
        gpuAtomType& entry = gpu->gpAtomTable[n];
        reader >> entry.name >> entry.r;
        reader.getline(line, 512);

        // Determine symbol from the radius.
        entry.symbol = (entry.r < 1.3f) ? 'H'
                     : (entry.r < 1.6f) ? 'O'
                     : (entry.r < 1.7f) ? 'N'
                     :                    'C';
#if (DUMP_PARAMETERS == 1)
        cout << n << " " << entry.name << " " << entry.symbol << " " << entry.r << endl;
#endif
    }
    return gpu->gAtomTypes;
}
// Reads per-atom nonbonded parameters and exclusion lists from a text file,
// then forwards them to gpuSetCoulombParameters() and gpuSetObcParameters().
//
// Header line as consumed here: the atom count, 8 skipped characters, epsfac,
// 7 skipped characters, fudge; the rest of the line is discarded. Each
// following record is
//   <ignored int> c6 c12 q <atom type name> scale numExclusions <exclusion>...
// The atom type name is resolved to a radius and a symbol through the
// atom-type table (gpuGetAtomicRadius / gpuGetAtomicSymbol).
//
// Returns the number of atoms read. If the file cannot be opened an error is
// printed and the process exits with status -1.
extern "C"
int gpuReadCoulombParameters(gpuContext gpu, char* fname)
{
    ifstream infile(fname);
    if (!infile.fail())
    {
        char buff[1024];
        // Zero-initialised so a malformed header yields an empty system
        // rather than an indeterminate count.
        unsigned int coulombs = 0;
        float fudge = 0.0f;
        float epsfac = 1.0f;
        infile >> coulombs;
        infile.get(buff, 9);
        infile >> epsfac;
        infile.get(buff, 8);
        infile >> fudge;
        infile.getline(buff, 512);
        // `atom` is passed through unfilled; both setters called below only
        // take its size().
        vector<int> atom(coulombs);
        vector<float> c6(coulombs);
        vector<float> c12(coulombs);
        vector<float> q(coulombs);
        vector<float> radius(coulombs);
        vector<float> scale(coulombs);
        vector<char> symbol(coulombs);
        vector<vector<int> > exclusions(coulombs);
        unsigned int total_exclusions = 0;
        for (unsigned int i = 0; i < coulombs; i++)
        {
            int junk;
            // Zero-initialised so a failed extraction cannot drive the inner
            // loop with an indeterminate count.
            int numExclusions = 0;
            // Read into a std::string: extracting into a fixed char array has
            // no length limit and could overrun on a long token.
            string atype;
            infile >>
                junk >>
                c6[i] >>
                c12[i] >>
                q[i] >>
                atype >>
                scale[i] >>
                numExclusions;
            radius[i] = gpuGetAtomicRadius(gpu, atype);
            symbol[i] = gpuGetAtomicSymbol(gpu, atype);
            for (int j = 0; j < numExclusions; j++)
            {
                int exclusion;
                infile >> exclusion;
                exclusions[i].push_back(exclusion);
            }
            // Tally what was actually read so the summary below is meaningful
            // (the counter used to stay at zero).
            total_exclusions += exclusions[i].size();
        }
        cout << total_exclusions << " total exclusions.\n";
        gpuSetCoulombParameters(gpu, epsfac, atom, c6, c12, q, symbol, exclusions);
        gpuSetObcParameters(gpu, defaultInnerDielectric, defaultSolventDielectric, atom, radius, scale);
        return coulombs;
    }
    else
    {
        cout << "Error opening Coulomb parameter file " << fname << endl;
        exit(-1);
    }
    return 0;
}
// Stores per-atom charge and Lennard-Jones data and the exclusion matrix in
// the GPU context, then uploads the position/charge and sigma/epsilon streams.
//
//   epsfac      - stored in gpu->sim.epsfac
//   atom        - only its size() is used, as the number of atoms
//   c6, c12     - per-atom Lennard-Jones coefficients
//   q           - per-atom charge, written to psPosq4[i].w
//   symbol      - optional per-atom symbol; copied to gpu->pAtomSymbol when
//                 the vector is non-empty
//   exclusions  - exclusions[i] lists the atom indices excluded from atom i
//
// Per atom, psSigEps2 holds
//   .x = 0.5 * (c12 / c6)^(1/6)   (0.5 unless both c6 and c12 are positive)
//   .y = c6 * sqrt(1 / c12)       (0   unless both c6 and c12 are positive)
//
// gpu->pExclusion is indexed as [i * paddedNumberOfAtoms + j]; an entry of 0
// marks an excluded pair. Atoms in the padding range [coulombs, padded) get
// dummy coordinates and zero parameters, and are excluded from every atom.
// A warning is printed for each asymmetric entry among the real atoms.
extern "C"
void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const vector<int>& atom, const vector<float>& c6, const vector<float>& c12, const vector<float>& q,
        const vector<char>& symbol, const vector<vector<int> >& exclusions)
{
    unsigned int coulombs = atom.size();
    gpu->sim.epsfac = epsfac;

    for (unsigned int i = 0; i < coulombs; i++)
    {
        float p0 = q[i];
        float p1 = 0.5f, p2 = 0.0f;
        if ((c6[i] > 0.0f) && (c12[i] > 0.0f))
        {
            p1 = 0.5f * pow(c12[i] / c6[i], 1.0f / 6.0f);
            p2 = c6[i] * sqrt(1.0f / c12[i]);
        }
        if (symbol.size() > 0)
            gpu->pAtomSymbol[i] = symbol[i];
        gpu->psPosq4->_pSysStream[0][i].w = p0;
        gpu->psSigEps2->_pSysStream[0][i].x = p1;
        gpu->psSigEps2->_pSysStream[0][i].y = p2;

#if (DUMP_PARAMETERS == 1)
        // Print the exclusion count for this atom; the indices themselves are
        // appended by the loop below. (A vector cannot be streamed directly.)
        cout <<
            i << " " <<
            gpu->psPosq4->_pSysStream[0][i].w << " " <<
            gpu->psSigEps2->_pSysStream[0][i].x << " " <<
            gpu->psSigEps2->_pSysStream[0][i].y << " " <<
            p0 << " " <<
            p1 << " " <<
            p2 << " " <<
            exclusions[i].size();
#endif
        for (int j = 0; j < (int) exclusions[i].size(); j++)
        {
#if (DUMP_PARAMETERS == 1)
            cout << " " << exclusions[i][j];
#endif
            gpu->pExclusion[i * gpu->sim.paddedNumberOfAtoms + exclusions[i][j]] = 0;
        }
#if (DUMP_PARAMETERS == 1)
        cout << endl;
#endif
    }

    // Dummy out extra atom data
    for (unsigned int i = coulombs; i < gpu->sim.paddedNumberOfAtoms; i++)
    {
        gpu->psPosq4->_pSysStream[0][i].x = 100000.0f + i * 10.0f;
        gpu->psPosq4->_pSysStream[0][i].y = 100000.0f + i * 10.0f;
        gpu->psPosq4->_pSysStream[0][i].z = 100000.0f + i * 10.0f;
        gpu->psPosq4->_pSysStream[0][i].w = 0.0f;
        gpu->psSigEps2->_pSysStream[0][i].x = 0.0f;
        gpu->psSigEps2->_pSysStream[0][i].y = 0.0f;
    }

    // Add in remaining exclusions: every row and column that belongs to a
    // padding atom is cleared.
    for (unsigned int i = coulombs; i < gpu->sim.paddedNumberOfAtoms; i++)
    {
        for (unsigned int j = 0; j < gpu->sim.paddedNumberOfAtoms; j++)
        {
            gpu->pExclusion[i * gpu->sim.paddedNumberOfAtoms + j] = 0;
            gpu->pExclusion[j * gpu->sim.paddedNumberOfAtoms + i] = 0;
        }
    }

    gpu->psPosq4->Upload();
    gpu->psSigEps2->Upload();

    // Check for exclusion consistency
    for (unsigned int i = 0; i < coulombs; i++)
    {
        for (unsigned int j = i; j < coulombs; j++)
        {
            if (gpu->pExclusion[i * gpu->sim.paddedNumberOfAtoms + j] != gpu->pExclusion[j * gpu->sim.paddedNumberOfAtoms + i])
                cout << "Warning: inconsistent exclusion betweens atoms " << i << " and " << j << endl;
        }
    }
}
// Stores the per-atom OBC data in the GPU context and uploads it.
//
//   innerDielectric, solventDielectric - used to compute gpu->sim.preFactor
//   atom    - only its size() is used, as the number of atoms
//   radius  - per-atom radius; psObcData[i].x = radius[i] - dielectricOffset
//   scale   - per-atom scale;  psObcData[i].y = scale[i] * psObcData[i].x
//
// Entries in the padding range [atoms, paddedNumberOfAtoms) receive fixed
// dummy values (Born radius 0.2, OBC data 0.01 / 0.01).
extern "C"
void gpuSetObcParameters(gpuContext gpu, float innerDielectric, float solventDielectric, const vector<int>& atom, const vector<float>& radius, const vector<float>& scale)
{
    const unsigned int atoms = atom.size();

    // Real atoms.
    for (unsigned int n = 0; n < atoms; ++n)
    {
        float2& obc = gpu->psObcData->_pSysStream[0][n];
        obc.x = radius[n] - dielectricOffset;
        obc.y = scale[n] * obc.x;
#if (DUMP_PARAMETERS == 1)
        cout << n << " " << obc.x << " " << obc.y;
#endif
    }

    // Dummy out extra atom data
    for (unsigned int n = atoms; n < gpu->sim.paddedNumberOfAtoms; ++n)
    {
        float2& obc = gpu->psObcData->_pSysStream[0][n];
        gpu->psBornRadii->_pSysStream[0][n] = 0.2f;
        obc.x = 0.01f;
        obc.y = 0.01f;
    }

    gpu->psBornRadii->Upload();
    gpu->psObcData->Upload();

    gpu->sim.preFactor = 2.0f*electricConstant*((1.0f/innerDielectric)-(1.0f/solventDielectric))*gpu->sim.forceConversionFactor;
}
// Reads SHAKE distance constraints from a text file and forwards them to
// gpuSetShakeParameters() with a tolerance of 1e-4.
//
// Header line as consumed here: one token that is ignored, then the number of
// constraints; the rest of the line is discarded. Each following record is
//   <ignored int> atom1 atom2 distance invMass1 invMass2
//
// Returns gpu->sim.ShakeConstraints as set by gpuSetShakeParameters(). If the
// file cannot be opened an error is printed and the process exits with
// status -1.
extern "C"
int gpuReadShakeParameters(gpuContext gpu, char* fname)
{
    ifstream infile(fname);
    if (infile.fail())
    {
        cout << "Error opening Shake parameter file " << fname << endl;
        exit(-1);
    }

    char scratch[512];
    int numConstraints;
    infile >> scratch >> numConstraints;
    infile.getline(scratch, 512);

    vector<int> atom1(numConstraints), atom2(numConstraints);
    vector<float> distance(numConstraints);
    vector<float> invMass1(numConstraints), invMass2(numConstraints);

    for (int n = 0; n < numConstraints; ++n)
    {
        int recordNumber;   // present in the file but not used
        infile >> recordNumber >> atom1[n] >> atom2[n] >> distance[n] >> invMass1[n] >> invMass2[n];
    }

    gpuSetShakeParameters(gpu, atom1, atom2, distance, invMass1, invMass2, 1e-4f);
    return gpu->sim.ShakeConstraints;
}
// Groups pairwise SHAKE distance constraints into clusters (one central atom
// plus its constrained partners), stores them in the GPU context and uploads
// them to the device.
//
//   atom1, atom2       - atom indices of each constrained pair (parallel arrays)
//   distance           - constrained distance of each pair
//   invMass1, invMass2 - inverse masses matching atom1[i] and atom2[i]
//   tolerance          - stored in gpu->sim.shakeTolerance
//
// Throws OpenMMException when the atom chosen as peripheral in a pair takes
// part in more than one constraint.
//
// When DeltaShake is defined, the indices of all atoms that take part in no
// constraint are additionally collected and uploaded.
extern "C"
void gpuSetShakeParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<float>& distance,
const vector<float>& invMass1, const vector<float>& invMass2, float tolerance)
{
// Find how many constraints each atom is involved in.
vector<int> constraintCount(gpu->natoms, 0);
for (int i = 0; i < atom1.size(); i++) {
constraintCount[atom1[i]]++;
constraintCount[atom2[i]]++;
}
// Find clusters consisting of a central atom with up to three peripheral atoms.
map<int, ShakeCluster> clusters;
for (int i = 0; i < atom1.size(); i++) {
// Determine which is the central atom.
// Preference order: atom1 if it has more than one constraint, else atom2 if
// it has more than one; if both have exactly one, the lower index is central.
bool firstIsCentral;
if (constraintCount[atom1[i]] > 1)
firstIsCentral = true;
else if (constraintCount[atom2[i]] > 1)
firstIsCentral = false;
else if (atom1[i] < atom2[i])
firstIsCentral = true;
else
firstIsCentral = false;
int centralID, peripheralID;
float centralInvMass, peripheralInvMass;
if (firstIsCentral) {
centralID = atom1[i];
peripheralID = atom2[i];
centralInvMass = invMass1[i];
peripheralInvMass = invMass2[i];
}
else {
centralID = atom2[i];
peripheralID = atom1[i];
centralInvMass = invMass2[i];
peripheralInvMass = invMass1[i];
}
// A peripheral atom must belong to exactly one constraint.
if (constraintCount[peripheralID] != 1)
throw OpenMMException("Only bonds to hydrogens may be constrained");
// Add it to the cluster.
// Clusters are keyed by the central atom's index; one is created on first use.
if (clusters.find(centralID) == clusters.end()) {
clusters[centralID] = ShakeCluster(centralID, centralInvMass);
}
clusters[centralID].addAtom(peripheralID, distance[i], peripheralInvMass);
}
// Fill in the Cuda streams.
// One stream entry per cluster.
CUDAStream<int4>* psShakeID = new CUDAStream<int4>((int) clusters.size(), 1);
gpu->psShakeID = psShakeID;
gpu->sim.pShakeID = psShakeID->_pDevStream[0];
CUDAStream<float4>* psShakeParameter = new CUDAStream<float4>((int) clusters.size(), 1);
gpu->psShakeParameter = psShakeParameter;
gpu->sim.pShakeParameter = psShakeParameter->_pDevStream[0];
gpu->sim.ShakeConstraints = clusters.size();
int index = 0;
for (map<int, ShakeCluster>::const_iterator iter = clusters.begin(); iter != clusters.end(); ++iter) {
const ShakeCluster& cluster = iter->second;
// ID entry: central atom followed by up to three peripheral atoms; unused
// peripheral slots are marked with -1.
psShakeID->_pSysStream[0][index].x = cluster.centralID;
psShakeID->_pSysStream[0][index].y = cluster.peripheralID[0];
psShakeID->_pSysStream[0][index].z = cluster.size > 1 ? cluster.peripheralID[1] : -1;
psShakeID->_pSysStream[0][index].w = cluster.size > 2 ? cluster.peripheralID[2] : -1;
// Parameter entry: central inverse mass, 0.5 / (sum of inverse masses),
// squared distance, peripheral inverse mass. The cluster exposes a single
// distance and a single peripheral inverse mass for all of its members.
psShakeParameter->_pSysStream[0][index].x = cluster.centralInvMass;
psShakeParameter->_pSysStream[0][index].y = 0.5f/(cluster.centralInvMass+cluster.peripheralInvMass);
psShakeParameter->_pSysStream[0][index].z = cluster.distance*cluster.distance;
psShakeParameter->_pSysStream[0][index].w = cluster.peripheralInvMass;
++index;
}
psShakeID->Upload();
psShakeParameter->Upload();
gpu->sim.shakeTolerance = tolerance;
// Threads per block: clusters divided over the blocks (rounded up), clamped
// to the range [1, max_shake_threads_per_block].
gpu->sim.shake_threads_per_block = (gpu->sim.ShakeConstraints + gpu->sim.blocks - 1) / gpu->sim.blocks;
if (gpu->sim.shake_threads_per_block > gpu->sim.max_shake_threads_per_block)
gpu->sim.shake_threads_per_block = gpu->sim.max_shake_threads_per_block;
if (gpu->sim.shake_threads_per_block < 1)
gpu->sim.shake_threads_per_block = 1;
#ifdef DeltaShake
// count number of atoms w/o constraint
int count = 0;
for (int i = 0; i < gpu->natoms; i++)
if (constraintCount[i] == 0)
count++;
// Allocate NonShake parameters
gpu->sim.NonShakeConstraints = count;
// NOTE(review): "|| true" makes this condition always true, so the else
// branch below is never taken and a stream is allocated even when count is 0.
if( count || true ){
CUDAStream<int>* psNonShakeID = new CUDAStream<int>(count, 1);
gpu->psNonShakeID = psNonShakeID;
gpu->sim.pNonShakeID = psNonShakeID->_pDevStream[0];
// Same rounding and clamping as for shake_threads_per_block above.
gpu->sim.nonshake_threads_per_block = (count + gpu->sim.blocks - 1) / gpu->sim.blocks;
if (gpu->sim.nonshake_threads_per_block > gpu->sim.max_shake_threads_per_block)
gpu->sim.nonshake_threads_per_block = gpu->sim.max_shake_threads_per_block;
if (gpu->sim.nonshake_threads_per_block < 1)
gpu->sim.nonshake_threads_per_block = 1;
// load indices
count = 0;
for (int i = 0; i < gpu->natoms; i++){
if (constraintCount[i] == 0){
psNonShakeID->_pSysStream[0][count++] = i;
}
}
psNonShakeID->Upload();
} else {
gpu->sim.nonshake_threads_per_block = 0;
}
#endif
}
// Derives the size-dependent fields of gpu->sim from gpu->natoms and allocates
// the per-atom streams, the random-number streams and the linear-momentum
// buffer. It reads gpu->natoms, gpu->sim.blocks and
// gpu->sim.random_threads_per_block, so those must be set before the call.
// It also resets the atom-type table pointer and count. Always returns 1.
extern "C"
int gpuAllocateInitialBuffers(gpuContext gpu)
{
gpu->sim.atoms = gpu->natoms;
// Round the atom count up to a multiple of 2^GRIDBITS
// (assumes GRID == 1 << GRIDBITS -- TODO confirm).
gpu->sim.paddedNumberOfAtoms = ((gpu->sim.atoms + GRID - 1) >> GRIDBITS) << GRIDBITS;
gpu->sim.degreesOfFreedom = 3 * gpu->sim.atoms - 6;
// The old table is not freed here; the pointer is simply cleared.
gpu->gpAtomTable = NULL;
gpu->gAtomTypes = 0;
gpu->sim.nonbondOutputBuffers = gpu->sim.paddedNumberOfAtoms / GRID;
gpu->sim.totalNonbondOutputBuffers = 2 * gpu->sim.nonbondOutputBuffers;
gpu->sim.outputBuffers = gpu->sim.totalNonbondOutputBuffers;
// Position/charge stream; its stride is cached along with 2x, 3x and 4x multiples.
gpu->psPosq4 = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1);
gpu->sim.stride = gpu->psPosq4->_stride;
gpu->sim.stride2 = gpu->sim.stride * 2;
gpu->sim.stride3 = gpu->sim.stride * 3;
gpu->sim.stride4 = gpu->sim.stride * 4;
gpu->sim.pPosq = gpu->psPosq4->_pDevStream[0];
// NOTE(review): the next four assignments repeat the stride setup just above
// with the same values; they are redundant.
gpu->sim.stride = gpu->psPosq4->_stride;
gpu->sim.stride2 = 2 * gpu->sim.stride;
gpu->sim.stride3 = 3 * gpu->sim.stride;
gpu->sim.stride4 = 4 * gpu->sim.stride;
gpu->sim.exclusionStride = gpu->sim.stride / GRID;
// Remaining per-atom streams, each sized to the padded atom count, with the
// device pointer mirrored into gpu->sim.
gpu->psPosqP4 = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1);
gpu->sim.pPosqP = gpu->psPosqP4->_pDevStream[0];
gpu->psOldPosq4 = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1);
gpu->sim.pOldPosq = gpu->psOldPosq4->_pDevStream[0];
gpu->psVelm4 = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1);
gpu->sim.pVelm4 = gpu->psVelm4->_pDevStream[0];
gpu->psvVector4 = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1);
gpu->sim.pvVector4 = gpu->psvVector4->_pDevStream[0];
gpu->psxVector4 = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1);
gpu->sim.pxVector4 = gpu->psxVector4->_pDevStream[0];
gpu->psBornRadii = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, 1);
gpu->sim.pBornRadii = gpu->psBornRadii->_pDevStream[0];
gpu->psObcChain = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, 1);
gpu->sim.pObcChain = gpu->psObcChain->_pDevStream[0];
gpu->psSigEps2 = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms, 1);
gpu->sim.pAttr = gpu->psSigEps2->_pDevStream[0];
gpu->psObcData = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms, 1);
gpu->sim.pObcData = gpu->psObcData->_pDevStream[0];
// Host-only array, sized to the real (unpadded) atom count.
gpu->pAtomSymbol = new unsigned char[gpu->natoms];
// Determine randoms
// The seed is the low 20 bits of the current time.
gpu->seed = (unsigned long)time(NULL) & 0x000fffff;
gpu->sim.randomFrames = 995;
gpu->sim.randomIterations = gpu->sim.randomFrames;
gpu->sim.randoms = gpu->sim.randomFrames * gpu->sim.paddedNumberOfAtoms - 5 * GRID;
gpu->sim.totalRandoms = gpu->sim.randoms + gpu->sim.paddedNumberOfAtoms;
gpu->sim.totalRandomsTimesTwo = gpu->sim.totalRandoms * 2;
// Each random stream holds 2 * totalRandoms entries; the "a" pointers address
// the first half and the "b" pointers the second half.
gpu->psRandom4 = new CUDAStream<float4>(gpu->sim.totalRandomsTimesTwo, 1);
gpu->psRandom2 = new CUDAStream<float2>(gpu->sim.totalRandomsTimesTwo, 1);
gpu->psRandomPosition = new CUDAStream<int>(gpu->sim.blocks, 1);
gpu->psRandomSeed = new CUDAStream<uint4>(gpu->sim.blocks * gpu->sim.random_threads_per_block, 1);
gpu->sim.pRandom4a = gpu->psRandom4->_pDevStream[0];
gpu->sim.pRandom2a = gpu->psRandom2->_pDevStream[0];
gpu->sim.pRandom4b = gpu->psRandom4->_pDevStream[0] + gpu->sim.totalRandoms;
gpu->sim.pRandom2b = gpu->psRandom2->_pDevStream[0] + gpu->sim.totalRandoms;
gpu->sim.pRandomPosition = gpu->psRandomPosition->_pDevStream[0];
gpu->sim.pRandomSeed = gpu->psRandomSeed->_pDevStream[0];
// One read position per block, all starting at 0.
for (int i = 0; i < (int) gpu->sim.blocks; i++)
{
gpu->psRandomPosition->_pSysStream[0][i] = 0;
}
// Seed the host generator and draw four values per random-generator thread.
int seed = gpu->seed | ((gpu->seed ^ 0xffffffff) << 16);
srand(seed);
for (int i = 0; i < (int) (gpu->sim.blocks * gpu->sim.random_threads_per_block); i++)
{
gpu->psRandomSeed->_pSysStream[0][i].x = rand();
gpu->psRandomSeed->_pSysStream[0][i].y = rand();
gpu->psRandomSeed->_pSysStream[0][i].z = rand();
gpu->psRandomSeed->_pSysStream[0][i].w = rand();
}
// Both random-value streams are zero-filled here.
float randomValue = 0.0f;
for (int i = 0; i < (int) gpu->sim.totalRandomsTimesTwo; i++)
{
gpu->psRandom4->_pSysStream[0][i].x = randomValue;
gpu->psRandom4->_pSysStream[0][i].y = randomValue;
gpu->psRandom4->_pSysStream[0][i].z = randomValue;
gpu->psRandom4->_pSysStream[0][i].w = randomValue;
gpu->psRandom2->_pSysStream[0][i].x = randomValue;
gpu->psRandom2->_pSysStream[0][i].y = randomValue;
}
gpu->psRandomSeed->Upload();
gpu->psRandom4->Upload();
gpu->psRandom2->Upload();
gpu->psRandomPosition->Upload();
// Allocate and clear linear momentum buffer
// One float4 per block.
gpu->psLinearMomentum = new CUDAStream<float4>(gpu->sim.blocks, 1);
gpu->sim.pLinearMomentum = gpu->psLinearMomentum->_pDevStream[0];
for (int i = 0; i < (int) gpu->sim.blocks; i++)
{
gpu->psLinearMomentum->_pSysStream[0][i].x = 0.0f;
gpu->psLinearMomentum->_pSysStream[0][i].y = 0.0f;
gpu->psLinearMomentum->_pSysStream[0][i].z = 0.0f;
gpu->psLinearMomentum->_pSysStream[0][i].w = 0.0f;
}
gpu->psLinearMomentum->Upload();
return 1;
}
// Reads atom coordinates, charges, velocities and inverse masses from a text
// file straight into the position and velocity streams, then uploads them.
//
// Header line as consumed here: one token that is ignored, then the atom
// count; the rest of the line is discarded. Each following record is
//   <ignored int> x y z q vx vy vz w
// where q is stored in psPosq4[i].w and w in psVelm4[i].w. The total mass is
// accumulated as the sum of 1/w, so w is treated as an inverse mass.
//
// gpuAllocateInitialBuffers() is called once the atom count is known. If the
// file cannot be opened an error is printed and the process exits with status
// -1, matching the other readers in this file; previously a missing file
// silently produced an empty system and an infinite inverseTotalMass.
extern "C"
void gpuReadCoordinates(gpuContext gpu, char* fname)
{
    ifstream infile(fname);
    if (infile.fail())
    {
        cout << "Error opening coordinate file " << fname << endl;
        exit(-1);
    }
    gpu->natoms = 0;
    char buff[512];
    infile >> buff >> gpu->natoms;
    infile.getline(buff, 511);
    float totalMass = 0.0f;
    gpuAllocateInitialBuffers(gpu);
    for (int i = 0; i < gpu->natoms; i++)
    {
        int junk;   // leading record number, not used
        infile >> junk >>
            gpu->psPosq4->_pSysStream[0][i].x >>
            gpu->psPosq4->_pSysStream[0][i].y >>
            gpu->psPosq4->_pSysStream[0][i].z >>
            gpu->psPosq4->_pSysStream[0][i].w >>
            gpu->psVelm4->_pSysStream[0][i].x >>
            gpu->psVelm4->_pSysStream[0][i].y >>
            gpu->psVelm4->_pSysStream[0][i].z >>
            gpu->psVelm4->_pSysStream[0][i].w;
        gpu->psxVector4->_pSysStream[0][i].x = 0.0f;
        gpu->psxVector4->_pSysStream[0][i].y = 0.0f;
        gpu->psxVector4->_pSysStream[0][i].z = 0.0f;
        gpu->psxVector4->_pSysStream[0][i].w = 0.0f;

        // Accumulate mass
        totalMass += 1.0f / gpu->psVelm4->_pSysStream[0][i].w;
    }
    gpu->sim.inverseTotalMass = 1.0f / totalMass;
    gpu->psPosq4->Upload();
    gpu->psVelm4->Upload();
    gpu->psxVector4->Upload();
}
// Copies atom positions into the position stream, uploads it, and flags the
// Born radii for recalculation. The .w component of each entry is left
// untouched. The three vectors are indexed over [0, gpu->natoms).
extern "C"
void gpuSetPositions(gpuContext gpu, const vector<float>& x, const vector<float>& y, const vector<float>& z)
{
    for (int atom = 0; atom < gpu->natoms; ++atom)
    {
        float4& pos = gpu->psPosq4->_pSysStream[0][atom];
        pos.x = x[atom];
        pos.y = y[atom];
        pos.z = z[atom];
    }
    gpu->psPosq4->Upload();

    // set flag to recalculate Born radii
    gpu->bRecalculateBornRadii = true;
}
// Copies atom velocities into the velocity stream and uploads it. The .w
// component of each entry is left untouched. The three vectors are indexed
// over [0, gpu->natoms).
extern "C"
void gpuSetVelocities(gpuContext gpu, const vector<float>& x, const vector<float>& y, const vector<float>& z)
{
    for (int atom = 0; atom < gpu->natoms; ++atom)
    {
        float4& vel = gpu->psVelm4->_pSysStream[0][atom];
        vel.x = x[atom];
        vel.y = y[atom];
        vel.z = z[atom];
    }
    gpu->psVelm4->Upload();
}
// Stores the inverse of each atom's mass in the .w component of the velocity
// stream, records the inverse of the summed mass in
// gpu->sim.inverseTotalMass, and uploads the stream. `mass` is indexed over
// [0, gpu->natoms).
extern "C"
void gpuSetMass(gpuContext gpu, const vector<float>& mass)
{
    float massSum = 0.0f;
    for (int atom = 0; atom < gpu->natoms; ++atom)
    {
        gpu->psVelm4->_pSysStream[0][atom].w = 1.0f/mass[atom];
        massSum += mass[atom];
    }
    gpu->sim.inverseTotalMass = 1.0f / massSum;
    gpu->psVelm4->Upload();
}
// Resets the per-block random read positions to 0, reseeds the host generator
// from gpu->seed, fills the per-thread seed stream with rand() values, uploads
// both streams, and then calls gpuSetConstants() followed by
// kGenerateRandoms().
extern "C"
void gpuInitializeRandoms(gpuContext gpu)
{
    const int blockCount = (int) gpu->sim.blocks;
    for (int block = 0; block < blockCount; ++block)
    {
        gpu->psRandomPosition->_pSysStream[0][block] = 0;
    }

    int seed = gpu->seed | ((gpu->seed ^ 0xffffffff) << 16);
    srand(seed);

    // Four rand() draws per seed entry, taken in x, y, z, w order.
    const int seedCount = (int) (gpu->sim.blocks * gpu->sim.random_threads_per_block);
    for (int n = 0; n < seedCount; ++n)
    {
        uint4& entry = gpu->psRandomSeed->_pSysStream[0][n];
        entry.x = rand();
        entry.y = rand();
        entry.z = rand();
        entry.w = rand();
    }

    gpu->psRandomPosition->Upload();
    gpu->psRandomSeed->Upload();
    gpuSetConstants(gpu);
    kGenerateRandoms(gpu);
}
extern "C"
bool gpuIsAvailable()
{
int deviceCount;
cudaGetDeviceCount(&deviceCount);
return (deviceCount > 0);
}
extern "C"
void* gpuInitFromFile(char* fname)
{
ifstream infile(fname);
int numAtoms = 0;
char buff[512];
infile >> buff >> numAtoms;
gpuContext gpu = (gpuContext) gpuInit(numAtoms);
vector<float> x(numAtoms), y(numAtoms), z(numAtoms), charge(numAtoms), vx(numAtoms), vy(numAtoms), vz(numAtoms), mass(numAtoms);
infile.getline(buff, 511);
float totalMass = 0.0f;
for (int i = 0; i < gpu->natoms; i++)
{
int junk;
infile >> junk >>
x[i] >>
y[i] >>
z[i] >>
charge[i] >>
vx[i] >>
vy[i] >>
vz[i] >>
mass[i];
mass[i] = 1.0f/mass[i];
}
gpuSetPositions(gpu, x, y, z);
gpuSetVelocities(gpu, vx, vy, vz);
gpuSetMass(gpu, mass);
return (void*)gpu;
}
extern "C"
void* gpuInit(int numAtoms)
{
gpuContext gpu = new _gpuContext;
int LRFSize = 0;
int SMCount = 0;
int SMMajor = 0;
int SMMinor = 0;
// Get adapter
unsigned int device = 0;
char * pAdapter;
pAdapter = getenv ("NV_FAH_DEVICE");
if (pAdapter != NULL)
{
sscanf(pAdapter, "%d", &device);
}
cudaError_t status = cudaSetDevice(device);
RTERROR(status, "Error setting CUDA device")
// Determine which core to run on
#if 0
SYSTEM_INFO info;
GetSystemInfo(&info);
unsigned int cores = info.dwNumberOfProcessors;
if (cores > 1)
{
HANDLE hproc = GetCurrentProcess();
unsigned int core = (cores - 1) - (device % (cores - 1));
unsigned int mask = 1 << core;
SetProcessAffinityMask(hproc, mask);
}
#endif
// Determine kernel call configuration
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, 0);
// Determine SM version
if (deviceProp.major == 1)
{
switch (deviceProp.minor)
{
case 0:
case 1:
gpu->sm_version = SM_10;
gpu->sim.workUnitsPerSM = G8X_NONBOND_WORKUNITS_PER_SM;
break;
default:
gpu->sm_version = SM_12;
gpu->sim.workUnitsPerSM = GT2XX_NONBOND_WORKUNITS_PER_SM;
break;
}
}
gpu->sim.nonbond_blocks = deviceProp.multiProcessorCount;
gpu->sim.bornForce2_blocks = deviceProp.multiProcessorCount;
gpu->sim.blocks = deviceProp.multiProcessorCount;
if (deviceProp.regsPerBlock == 8192)
{
gpu->sim.nonbond_threads_per_block = G8X_NONBOND_THREADS_PER_BLOCK;
gpu->sim.bornForce2_threads_per_block = G8X_BORNFORCE2_THREADS_PER_BLOCK;
gpu->sim.max_shake_threads_per_block = G8X_SHAKE_THREADS_PER_BLOCK;
gpu->sim.max_update_threads_per_block = G8X_UPDATE_THREADS_PER_BLOCK;
gpu->sim.max_localForces_threads_per_block = G8X_LOCALFORCES_THREADS_PER_BLOCK;
gpu->sim.threads_per_block = G8X_THREADS_PER_BLOCK;
gpu->sim.random_threads_per_block = G8X_RANDOM_THREADS_PER_BLOCK;
}
else
{
gpu->sim.nonbond_threads_per_block = GT2XX_NONBOND_THREADS_PER_BLOCK;
gpu->sim.bornForce2_threads_per_block = GT2XX_BORNFORCE2_THREADS_PER_BLOCK;
gpu->sim.max_shake_threads_per_block = GT2XX_SHAKE_THREADS_PER_BLOCK;
gpu->sim.max_update_threads_per_block = GT2XX_UPDATE_THREADS_PER_BLOCK;
gpu->sim.max_localForces_threads_per_block = GT2XX_LOCALFORCES_THREADS_PER_BLOCK;
gpu->sim.threads_per_block = GT2XX_NONBOND_THREADS_PER_BLOCK;
gpu->sim.random_threads_per_block = GT2XX_RANDOM_THREADS_PER_BLOCK;
}
gpu->sim.shake_threads_per_block = gpu->sim.max_shake_threads_per_block;
gpu->sim.localForces_threads_per_block = gpu->sim.max_localForces_threads_per_block;
gpu->natoms = numAtoms;
gpuAllocateInitialBuffers(gpu);
for (int i = 0; i < gpu->natoms; i++)
{
gpu->psxVector4->_pSysStream[0][i].x = 0.0f;
gpu->psxVector4->_pSysStream[0][i].y = 0.0f;
gpu->psxVector4->_pSysStream[0][i].z = 0.0f;
gpu->psxVector4->_pSysStream[0][i].w = 0.0f;
}
gpu->psxVector4->Upload();
gpu->iterations = 0;
gpu->sim.update_threads_per_block = (gpu->natoms + gpu->sim.blocks - 1) / gpu->sim.blocks;
if (gpu->sim.update_threads_per_block > gpu->sim.max_update_threads_per_block)
gpu->sim.update_threads_per_block = gpu->sim.max_update_threads_per_block;
if (gpu->sim.update_threads_per_block < 1)
gpu->sim.update_threads_per_block = 1;
gpu->sim.bf_reduce_threads_per_block = gpu->sim.update_threads_per_block;
gpu->sim.bsf_reduce_threads_per_block = (gpu->sim.stride4 + gpu->natoms + gpu->sim.blocks - 1) / gpu->sim.blocks;
gpu->sim.bsf_reduce_threads_per_block = ((gpu->sim.bsf_reduce_threads_per_block + (GRID - 1)) / GRID) * GRID;
if (gpu->sim.bsf_reduce_threads_per_block > gpu->sim.threads_per_block)
gpu->sim.bsf_reduce_threads_per_block = gpu->sim.threads_per_block;
if (gpu->sim.bsf_reduce_threads_per_block < 1)
gpu->sim.bsf_reduce_threads_per_block = 1;
// Initialize constants to reasonable values
gpu->sim.probeRadius = probeRadius;
gpu->sim.surfaceAreaFactor = surfaceAreaFactor;
gpu->sim.electricConstant = electricConstant;
gpu->sim.bigFloat = 99999999.0f;
gpu->sim.forceConversionFactor = forceConversionFactor;
gpu->sim.preFactor = 2.0f*electricConstant*((1.0f/defaultInnerDielectric)-(1.0f/defaultSolventDielectric))*gpu->sim.forceConversionFactor;
gpu->sim.dielectricOffset = dielectricOffset;
gpu->sim.alphaOBC = alphaOBC;
gpu->sim.betaOBC = betaOBC;
gpu->sim.gammaOBC = gammaOBC;
gpuSetIntegrationParameters(gpu, 1.0f, 2.0e-3f, 300.0f);
gpu->sim.maxShakeIterations = 15;
gpu->sim.shakeTolerance = 1.0e-04f * 2.0f;
gpu->sim.InvMassJ = 9.920635e-001f;
gpu->grid = GRID;
gpu->bCalculateCM = false;
gpu->bRemoveCM = false;
gpu->bRecalculateBornRadii = true;
gpuInitializeRandoms(gpu);
// To be determined later
gpu->psLJ14ID = NULL;
gpu->psForce4 = NULL;
gpu->sim.pForce4 = NULL;
gpu->sim.pForce4a = NULL;
gpu->sim.pForce4b = NULL;
gpu->psBornForce = NULL;
gpu->sim.pBornForce = NULL;
gpu->psBornSum = NULL;
gpu->sim.pBornSum = NULL;
gpu->psBondID = NULL;
gpu->psBondParameter = NULL;
gpu->psBondAngleID1 = NULL;
gpu->psBondAngleID2 = NULL;
gpu->psBondAngleParameter = NULL;
gpu->psDihedralID1 = NULL;
gpu->psDihedralID2 = NULL;
gpu->psDihedralParameter = NULL;
gpu->psRbDihedralID1 = NULL;
gpu->psRbDihedralID2 = NULL;
gpu->psRbDihedralParameter1 = NULL;
gpu->psRbDihedralParameter2 = NULL;
gpu->psLJ14ID = NULL;
gpu->psLJ14Parameter = NULL;
gpu->psShakeID = NULL;
gpu->psShakeParameter = NULL;
gpu->psExclusion = NULL;
gpu->psWorkUnit = NULL;
// Initialize output buffer before reading parameters
gpu->pOutputBufferCounter = new unsigned int[gpu->sim.paddedNumberOfAtoms];
memset(gpu->pOutputBufferCounter, 0, gpu->sim.paddedNumberOfAtoms * sizeof(unsigned int));
// Initialize exclusion array
gpu->pExclusion = new unsigned int[gpu->sim.paddedNumberOfAtoms * gpu->sim.paddedNumberOfAtoms];
for (unsigned int i = 0; i < gpu->sim.paddedNumberOfAtoms * gpu->sim.paddedNumberOfAtoms; i++)
gpu->pExclusion[i] = 1;
return (void*)gpu;
}
// Derive every time-step dependent integration constant held in gpu->sim
// from the supplied parameters.
//
//   gpu         - context whose sim block is updated
//   tau         - stored as sim.tau; GDT is computed as deltaT / tau
//   deltaT      - time step; its reciprocal is cached as oneOverDeltaT
//   temperature - stored as sim.T; kT is computed as BOLTZ * T
//
// B, C and D are evaluated from polynomials in GDT/2 unless GDT >= 0.1, in
// which case the closed-form exponential expressions are used.  No check is
// made for tau == 0 or deltaT == 0.
extern "C"
void gpuSetIntegrationParameters(gpuContext gpu, float tau, float deltaT, float temperature) {
    // Step size, its reciprocal, tau and their ratio.
    gpu->sim.deltaT        = deltaT;
    gpu->sim.oneOverDeltaT = 1.0f/deltaT;
    gpu->sim.tau           = tau;
    gpu->sim.GDT           = gpu->sim.deltaT / gpu->sim.tau;

    // exp(+GDT/2), exp(-GDT/2), exp(+GDT), exp(-GDT) and simple combinations.
    gpu->sim.EPH           = exp(0.5f * gpu->sim.GDT);
    gpu->sim.EMH           = exp(-0.5f * gpu->sim.GDT);
    gpu->sim.EP            = exp(gpu->sim.GDT);
    gpu->sim.EM            = exp(-gpu->sim.GDT);
    gpu->sim.OneMinusEM    = 1.0f - gpu->sim.EM;
    gpu->sim.TauOneMinusEM = gpu->sim.tau * gpu->sim.OneMinusEM;

    // Written as a negated ">=" so that a NaN GDT still takes the polynomial
    // branch, exactly as an "if (GDT >= 0.1f) ... else ..." would.
    if (!(gpu->sim.GDT >= 0.1f))
    {
        // Polynomial forms expressed in h = GDT / 2.
        const float h        = 0.5f * gpu->sim.GDT;
        const float h2       = h * h;
        const float h4       = h2 * h2;
        const float c1_3     = 1.0f / 3.0f;
        const float c7_9     = 7.0f / 9.0f;
        const float c1_12    = 1.0f / 12.0f;
        const float c17_90   = 17.0f / 90.0f;
        const float c7_30    = 7.0f / 30.0f;
        const float c31_1260 = 31.0f / 1260.0f;
        const float c1_360   = 1.0f / 360.0f;

        gpu->sim.B = h4 * (c1_3 + h * (c1_3 + h * (c17_90 + h * c7_9)));
        gpu->sim.C = h2 * h * (2.0f * c1_3 + h * (-0.5f + h * (c7_30 + h * (-c1_12 + h * c31_1260))));
        gpu->sim.D = h2 * (-1.0f + h2 * (-c1_12 - h2 * c1_360));
    }
    else
    {
        // Closed-form expressions built from the exponentials above.
        const float ephMinusOne   = gpu->sim.EPH - 1.0f;
        const float ephMinusOneSq = ephMinusOne * ephMinusOne;

        gpu->sim.B = gpu->sim.GDT * (gpu->sim.EP - 1.0f) - 4.0f * ephMinusOneSq;
        gpu->sim.C = gpu->sim.GDT - 3.0f + 4.0f * gpu->sim.EMH - gpu->sim.EM;
        gpu->sim.D = 2.0f - gpu->sim.EPH - gpu->sim.EMH;
    }

    // Ratios and products of the quantities computed so far.
    gpu->sim.TauDOverEMMinusOne = gpu->sim.tau * gpu->sim.D / (gpu->sim.EM - 1.0f);
    gpu->sim.DOverTauC          = gpu->sim.D / (gpu->sim.tau * gpu->sim.C);
    gpu->sim.fix1               = gpu->sim.tau * (gpu->sim.EPH - gpu->sim.EMH);
    gpu->sim.oneOverFix1        = 1.0f / (gpu->sim.tau * (gpu->sim.EPH - gpu->sim.EMH));

    // Temperature dependent scale factors.
    gpu->sim.T  = temperature;
    gpu->sim.kT = BOLTZ * gpu->sim.T;
    gpu->sim.V  = sqrt(gpu->sim.kT * (1.0f - gpu->sim.EM));
    gpu->sim.X  = gpu->sim.tau * sqrt(gpu->sim.kT * gpu->sim.C);
    gpu->sim.Yv = sqrt(gpu->sim.kT * gpu->sim.B / gpu->sim.C);
    gpu->sim.Yx = gpu->sim.tau * sqrt(gpu->sim.kT * gpu->sim.B / (1.0f - gpu->sim.EM));
}
// Record the time step used by the Verlet update in gpu->sim, together with
// its reciprocal.  No other field of gpu->sim is modified.
extern "C"
void gpuSetVerletIntegrationParameters(gpuContext gpu, float deltaT) {
    gpu->sim.oneOverDeltaT = 1.0f/deltaT;
    gpu->sim.deltaT        = deltaT;
}
// Store the parameters used by the Brownian update in gpu->sim.
//
//   tau         - stored as sim.tau; here GDT is the PRODUCT deltaT * tau
//   deltaT      - time step; its reciprocal is cached as oneOverDeltaT
//   temperature - stored as sim.T; kT is computed as BOLTZ * T
//
// Yv and Yx both receive sqrt(2 * kT * deltaT * tau).
extern "C"
void gpuSetBrownianIntegrationParameters(gpuContext gpu, float tau, float deltaT, float temperature) {
    // Step size and coupling.
    gpu->sim.deltaT        = deltaT;
    gpu->sim.oneOverDeltaT = 1.0f/deltaT;
    gpu->sim.tau           = tau;
    gpu->sim.GDT           = gpu->sim.deltaT * gpu->sim.tau;

    // Thermal energy.
    gpu->sim.T  = temperature;
    gpu->sim.kT = BOLTZ * gpu->sim.T;

    // Both scale factors share one value; Yx is written first, then copied.
    gpu->sim.Yx = sqrt(2.0f*gpu->sim.kT*deltaT*tau);
    gpu->sim.Yv = gpu->sim.Yx;
}
// Store the Andersen thermostat settings in gpu->sim: temperature, kT
// (BOLTZ * T) and the collision probability.  The scale factors Yv, Yx, V
// and X are all reset to 1.
extern "C"
void gpuSetAndersenThermostatParameters(gpuContext gpu, float temperature, float collisionProbability) {
    gpu->sim.T                    = temperature;
    gpu->sim.kT                   = BOLTZ * gpu->sim.T;
    gpu->sim.collisionProbability = collisionProbability;

    gpu->sim.Yx = 1.0f;
    gpu->sim.Yv = gpu->sim.Yx;
    gpu->sim.X  = 1.0f;
    gpu->sim.V  = gpu->sim.X;
}
// Release everything owned by a GPU context and then the context itself.
//
// Host-side arrays are released with delete[], stream objects with delete.
// Deleting a NULL member is harmless, so streams that were never created
// (and are still NULL) need no special handling.
//
// Passing NULL is accepted and does nothing, mirroring free(NULL); previously
// a NULL context was dereferenced.
//
// After this call the context pointer must not be used again.
extern "C"
void gpuShutDown(gpuContext gpu)
{
    // Nothing to release for a context that was never created.
    if (gpu == NULL)
        return;

    // Delete sysmem pointers
    delete[] gpu->pOutputBufferCounter;
    delete[] gpu->pExclusion;
    delete[] gpu->gpAtomTable;
    delete[] gpu->pAtomSymbol;

    // Delete device pointers
    delete gpu->psPosq4;
    delete gpu->psPosqP4;
    delete gpu->psOldPosq4;
    delete gpu->psVelm4;
    delete gpu->psForce4;
    delete gpu->psxVector4;
    delete gpu->psvVector4;
    delete gpu->psSigEps2;
    delete gpu->psObcData;
    delete gpu->psObcChain;
    delete gpu->psBornForce;
    delete gpu->psBornRadii;
    delete gpu->psBornSum;
    delete gpu->psBondID;
    delete gpu->psBondParameter;
    delete gpu->psBondAngleID1;
    delete gpu->psBondAngleID2;
    delete gpu->psBondAngleParameter;
    delete gpu->psDihedralID1;
    delete gpu->psDihedralID2;
    delete gpu->psDihedralParameter;
    delete gpu->psRbDihedralID1;
    delete gpu->psRbDihedralID2;
    delete gpu->psRbDihedralParameter1;
    delete gpu->psRbDihedralParameter2;
    delete gpu->psLJ14ID;
    delete gpu->psLJ14Parameter;
    delete gpu->psShakeID;
    delete gpu->psShakeParameter;
    delete gpu->psExclusion;
    delete gpu->psWorkUnit;
    delete gpu->psRandom4;
    delete gpu->psRandom2;
    delete gpu->psRandomPosition;
    delete gpu->psRandomSeed;
    delete gpu->psLinearMomentum;

    // Wrap up
    delete gpu;
}
// Allocate the force / Born output streams and finalise the bonded-term
// bookkeeping that depends on the number of output buffers.
//
// Steps, in order:
//   1. outputBuffers = max(totalNonbondOutputBuffers, largest per-atom
//      counter in pOutputBufferCounter).
//   2. Create psForce4, psBornForce and psBornSum and publish their device
//      pointers in gpu->sim.
//   3. Accumulate the parameter-stream strides into the cumulative offsets
//      bond_offset .. LJ14_offset and derive localForces_threads_per_block
//      (rounded up to a multiple of 16, clamped to [1, max]).
//   4. Replace every stored output-buffer index b of the bonded terms by
//      (outputBuffers - 1) - b and upload the modified ID streams.
//
// The bonded ID and parameter streams must already exist.  Step 4 rewrites
// the host copies in place, so calling this function a second time flips the
// indices again.  Always returns 1.
extern "C"
int gpuBuildOutputBuffers(gpuContext gpu)
{
    // Largest buffer count requested by any atom, but never below the
    // non-bonded total.
    unsigned int bufferCount = gpu->sim.totalNonbondOutputBuffers;
    for (unsigned int atom = 0; atom < gpu->sim.paddedNumberOfAtoms; atom++)
    {
        const unsigned int requested = gpu->pOutputBufferCounter[atom];
        if (requested > bufferCount)
            bufferCount = requested;
    }
    gpu->sim.outputBuffers = bufferCount;

    // Output streams and their device-side addresses.
    gpu->psForce4       = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, bufferCount);
    gpu->psBornForce    = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, gpu->sim.nonbondOutputBuffers);
    gpu->psBornSum      = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, gpu->sim.nonbondOutputBuffers);
    gpu->sim.pForce4    = gpu->psForce4->_pDevStream[0];
    gpu->sim.pForce4a   = gpu->sim.pForce4;
    gpu->sim.pForce4b   = gpu->sim.pForce4 + gpu->sim.nonbondOutputBuffers * gpu->sim.stride;
    gpu->sim.pBornForce = gpu->psBornForce->_pDevStream[0];
    gpu->sim.pBornSum   = gpu->psBornSum->_pDevStream[0];

    // Determine local energy parameter offsets for bonded interactions
    // (each offset is the running sum of the parameter-stream strides).
    gpu->sim.bond_offset        = gpu->psBondParameter->_stride;
    gpu->sim.bond_angle_offset  = gpu->sim.bond_offset + gpu->psBondAngleParameter->_stride;
    gpu->sim.dihedral_offset    = gpu->sim.bond_angle_offset + gpu->psDihedralParameter->_stride;
    gpu->sim.rb_dihedral_offset = gpu->sim.dihedral_offset + gpu->psRbDihedralParameter1->_stride;
    gpu->sim.LJ14_offset        = gpu->sim.rb_dihedral_offset + gpu->psLJ14Parameter->_stride;

    // Threads per block for the local-force kernel: work per block rounded
    // up to a multiple of 16, then clamped.
    gpu->sim.localForces_threads_per_block = (gpu->sim.LJ14_offset / gpu->sim.blocks + 15) & 0xfffffff0;
    if (gpu->sim.localForces_threads_per_block > gpu->sim.max_localForces_threads_per_block)
        gpu->sim.localForces_threads_per_block = gpu->sim.max_localForces_threads_per_block;
    if (gpu->sim.localForces_threads_per_block < 1)
        gpu->sim.localForces_threads_per_block = 1;

    // Flip local force output buffers: index b becomes flip - b.
    int flip = bufferCount - 1;

    int4* bondID = gpu->psBondID->_pSysStream[0];
    for (int i = 0; i < (int) gpu->sim.bonds; i++)
    {
        bondID[i].z = flip - bondID[i].z;
        bondID[i].w = flip - bondID[i].w;
    }

    int4* angleID1 = gpu->psBondAngleID1->_pSysStream[0];
    int2* angleID2 = gpu->psBondAngleID2->_pSysStream[0];
    for (int i = 0; i < (int) gpu->sim.bond_angles; i++)
    {
        angleID1[i].w = flip - angleID1[i].w;
        angleID2[i].x = flip - angleID2[i].x;
        angleID2[i].y = flip - angleID2[i].y;
    }

    int4* dihedralID2 = gpu->psDihedralID2->_pSysStream[0];
    for (int i = 0; i < (int) gpu->sim.dihedrals; i++)
    {
        dihedralID2[i].x = flip - dihedralID2[i].x;
        dihedralID2[i].y = flip - dihedralID2[i].y;
        dihedralID2[i].z = flip - dihedralID2[i].z;
        dihedralID2[i].w = flip - dihedralID2[i].w;
    }

    int4* rbDihedralID2 = gpu->psRbDihedralID2->_pSysStream[0];
    for (int i = 0; i < (int) gpu->sim.rb_dihedrals; i++)
    {
        rbDihedralID2[i].x = flip - rbDihedralID2[i].x;
        rbDihedralID2[i].y = flip - rbDihedralID2[i].y;
        rbDihedralID2[i].z = flip - rbDihedralID2[i].z;
        rbDihedralID2[i].w = flip - rbDihedralID2[i].w;
    }

    int4* lj14ID = gpu->psLJ14ID->_pSysStream[0];
    for (int i = 0; i < (int) gpu->sim.LJ14s; i++)
    {
        lj14ID[i].z = flip - lj14ID[i].z;
        lj14ID[i].w = flip - lj14ID[i].w;
    }

    // Push the rewritten ID streams back to the device.
    gpu->psBondID->Upload();
    gpu->psBondAngleID1->Upload();
    gpu->psBondAngleID2->Upload();
    gpu->psDihedralID2->Upload();
    gpu->psRbDihedralID2->Upload();
    gpu->psLJ14ID->Upload();
    return 1;
}
// Build the list of grid x grid work units covering the upper triangle
// (x >= y) of the atom-atom interaction matrix and upload it.
//
// Each entry is encoded as (x << 17) | (y << 2), with bit 0 set when at least
// one pair inside the tile has a zero entry in gpu->pExclusion.
//
// Besides creating gpu->psWorkUnit the function also
//   - raises nonbond_blocks / bornForce2_blocks (to a multiple of their
//     current value) when there are more cells than
//     blocks * workUnitsPerSM,
//   - computes the per-block work-unit counts and remainders,
//   - lowers the threads per block when there are fewer cells than active
//     work units,
//   - calls gpuSetConstants().
//
// Indexing assumes paddedNumberOfAtoms is a multiple of gpu->grid; otherwise
// the tile scan would read past the end of pExclusion -- TODO confirm against
// the code that computes the padding.
//
// Returns the number of work units (cells).
extern "C"
int gpuBuildThreadBlockWorkList(gpuContext gpu)
{
    const unsigned int atoms = gpu->sim.paddedNumberOfAtoms;
    const unsigned int grid = gpu->grid;
    const unsigned int dim = (atoms + (grid - 1)) / grid;
    const unsigned int cells = dim * (dim + 1) / 2;
    const unsigned int* pExclusion = gpu->pExclusion;
    CUDAStream<unsigned int>* psWorkUnit = new CUDAStream<unsigned int>(cells, 1u);
    unsigned int* pWorkList = psWorkUnit->_pSysStream[0];
    gpu->psWorkUnit = psWorkUnit;
    gpu->sim.pWorkUnit = psWorkUnit->_pDevStream[0];
    gpu->sim.nonbond_workBlock = gpu->sim.nonbond_threads_per_block / GRID;
    gpu->sim.bornForce2_workBlock = gpu->sim.bornForce2_threads_per_block / GRID;
    gpu->sim.workUnits = cells;

    // Increase block count if necessary for extra large molecules that would
    // otherwise overflow the SM workunit buffers
    int minimumBlocks = (cells + gpu->sim.workUnitsPerSM - 1) / gpu->sim.workUnitsPerSM;
    if ((int) gpu->sim.nonbond_blocks < minimumBlocks)
    {
        gpu->sim.nonbond_blocks = gpu->sim.nonbond_blocks * ((minimumBlocks + gpu->sim.nonbond_blocks - 1) / gpu->sim.nonbond_blocks);
    }
    if ((int) gpu->sim.bornForce2_blocks < minimumBlocks)
    {
        gpu->sim.bornForce2_blocks = gpu->sim.bornForce2_blocks * ((minimumBlocks + gpu->sim.bornForce2_blocks - 1) / gpu->sim.bornForce2_blocks);
    }
    gpu->sim.nbWorkUnitsPerBlock = cells / gpu->sim.nonbond_blocks;
    gpu->sim.nbWorkUnitsPerBlockRemainder = cells - gpu->sim.nonbond_blocks * gpu->sim.nbWorkUnitsPerBlock;
    gpu->sim.bf2WorkUnitsPerBlock = cells / gpu->sim.bornForce2_blocks;
    gpu->sim.bf2WorkUnitsPerBlockRemainder = cells - gpu->sim.bornForce2_blocks * gpu->sim.bf2WorkUnitsPerBlock;

    // Decrease thread count for extra small molecules to spread computation
    // across entire chip
    int activeWorkUnits = gpu->sim.nonbond_blocks * gpu->sim.nonbond_workBlock;
    if (activeWorkUnits > (int) cells)
    {
        int balancedWorkBlock = (cells + gpu->sim.nonbond_blocks - 1) / gpu->sim.nonbond_blocks;
        gpu->sim.nonbond_threads_per_block = balancedWorkBlock * GRID;
        gpu->sim.nonbond_workBlock = balancedWorkBlock;
    }
    activeWorkUnits = gpu->sim.bornForce2_blocks * gpu->sim.bornForce2_workBlock;
    if (activeWorkUnits > (int) cells)
    {
        int balancedWorkBlock = (cells + gpu->sim.bornForce2_blocks - 1) / gpu->sim.bornForce2_blocks;
        gpu->sim.bornForce2_threads_per_block = balancedWorkBlock * GRID;
        gpu->sim.bornForce2_workBlock = balancedWorkBlock;
    }

    // Emit one work unit per tile of the upper triangle, row by row.
    unsigned int count = 0;
    for (unsigned int y = 0; y < dim; y++)
    {
        for (unsigned int x = y; x < dim; x++)
        {
            pWorkList[count] = (x << 17) | (y << 2);

            // Check for exclusions.  Only the presence of an excluded pair
            // matters, so the scan stops at the first one found instead of
            // counting every excluded pair in the tile.
            bool hasExclusion = false;
            for (unsigned int i = y * grid; i < y * grid + grid && !hasExclusion; i++)
            {
                for (unsigned int j = x * grid; j < x * grid + grid; j++)
                {
                    if (!pExclusion[i * atoms + j])
                    {
                        hasExclusion = true;
                        break;
                    }
                }
            }

            // Signal exclusions if they exist
            if (hasExclusion)
                pWorkList[count] |= 0x1;
            count++;
        }
    }
    psWorkUnit->Upload();
    gpuSetConstants(gpu);
    return cells;
}
// Convert the host exclusion matrix gpu->pExclusion into per-row bit masks,
// store them in a new gpu->psExclusion stream and upload it.
//
// The matrix is walked tile by tile (gpu->grid atoms per side).  For every
// row x1 of a tile one mask word is emitted: the word is shifted right once
// per column and the top bit is set when the matrix entry is non-zero, so
// after the last column of the tile the most recent column sits in bit 31.
//
// Calls gpuSetConstants() after the upload.
//
// Returns the number of zero matrix entries with x1 >= y1.
extern "C"
int gpuBuildExclusionList(gpuContext gpu)
{
    unsigned int atoms = gpu->sim.paddedNumberOfAtoms;
    CUDAStream<unsigned int>* psExclusion = new CUDAStream<unsigned int>(atoms * atoms, 1u);
    gpu->psExclusion = psExclusion;
    gpu->sim.pExclusion = psExclusion->_pDevStream[0];

    unsigned int* pMaskOut = psExclusion->_pSysStream[0];
    unsigned int written = 0;
    int excludedPairs = 0;

    for (unsigned int tileX = 0; tileX < atoms; tileX += gpu->grid)
    {
        for (unsigned int tileY = 0; tileY < atoms; tileY += gpu->grid)
        {
            for (unsigned row = tileX; row < tileX + gpu->grid; row++)
            {
                unsigned int bits = 0;
                for (unsigned int col = tileY; col < tileY + gpu->grid; col++)
                {
                    // Make room for this column, then flag it if the pair
                    // is not excluded.
                    bits >>= 1;
                    if (gpu->pExclusion[row * atoms + col] != 0)
                    {
                        bits |= 0x80000000;
                    }
                    else if (row >= col)
                    {
                        excludedPairs++;
                    }
                }
                pMaskOut[written] = bits;
                written++;
            }
        }
    }

    psExclusion->Upload();
    gpuSetConstants(gpu);
    return excludedPairs;
}
// Invoke the Set...Sim() routine of every kernel module with this context.
// The four *_12Sim variants are called only when gpu->sm_version is at least
// SM_12.  Always returns 1.
extern "C"
int gpuSetConstants(gpuContext gpu)
{
    // Modules that are set up unconditionally.
    SetCalculateCDLJForcesSim(gpu);
    SetCalculateCDLJObcGbsaForces1Sim(gpu);
    SetCalculateLocalForcesSim(gpu);
    SetCalculateObcGbsaBornSumSim(gpu);
    SetCalculateObcGbsaForces1Sim(gpu);
    SetCalculateObcGbsaForces2Sim(gpu);
    SetCalculateAndersenThermostatSim(gpu);
    SetForcesSim(gpu);
    SetUpdateShakeHSim(gpu);
    SetVerletUpdateSim(gpu);
    SetBrownianUpdateSim(gpu);
    SetRandomSim(gpu);

    // Devices below SM_12 are done at this point.
    if (!(gpu->sm_version >= SM_12))
        return 1;

    SetCalculateCDLJForces_12Sim(gpu);
    SetCalculateCDLJObcGbsaForces1_12Sim(gpu);
    SetCalculateObcGbsaForces1_12Sim(gpu);
    SetCalculateObcGbsaForces2_12Sim(gpu);
    return 1;
}
// Download psPosq4 and psVelm4 and print, for each of the first gpu->natoms
// entries, the four components of both on one line of stdout.
extern "C"
void gpuDumpCoordinates(gpuContext gpu)
{
    gpu->psPosq4->Download();
    gpu->psVelm4->Download();
    (void) printf( "\n\nCoordinates and velocities\n" );

    int i = 0;
    while (i < gpu->natoms)
    {
        const float4& posq = gpu->psPosq4->_pSysStream[0][i];
        printf("%4d: %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f\n", i,
            posq.x, posq.y, posq.z, posq.w,
            gpu->psVelm4->_pSysStream[0][i].x,
            gpu->psVelm4->_pSysStream[0][i].y,
            gpu->psVelm4->_pSysStream[0][i].z,
            gpu->psVelm4->_pSysStream[0][i].w
        );
        i++;
    }
}
// True when f is a NaN: NaN is the only float value that compares unequal
// to itself.
bool ISNAN(float f)
{
    return f != f;
}
// Scan positions, velocities, forces and Born forces of the first
// gpu->natoms atoms for NaN values.
//
// Returns true when no NaN is found.  Otherwise every offending atom is
// printed, followed by a closest-neighbour report for all atoms, and then
// each force contribution (non-bonded, OBC, local, cleared) is recomputed
// separately and passed to gpuDumpForces(); the function then returns false.
// The diagnostic path clears the force and Born-force buffers several times.
extern "C"
bool gpuCheckData(gpuContext gpu)
{
    gpu->psPosq4->Download();
    gpu->psVelm4->Download();
    gpu->psForce4->Download();
    gpu->psBornForce->Download();

    const float4* posq  = gpu->psPosq4->_pSysStream[0];
    const float4* force = gpu->psForce4->_pSysStream[0];
    const float*  born  = gpu->psBornForce->_pSysStream[0];

    // Pass 1: report every atom that carries a NaN anywhere.
    int violations = 0;
    for (int i = 0; i < gpu->natoms; i++)
    {
        const bool badPosition = ISNAN(posq[i].x) || ISNAN(posq[i].y) || ISNAN(posq[i].z);
        const bool badVelocity = ISNAN( gpu->psVelm4->_pSysStream[0][i].x) ||
                                 ISNAN( gpu->psVelm4->_pSysStream[0][i].y) ||
                                 ISNAN( gpu->psVelm4->_pSysStream[0][i].z);
        const bool badForce    = ISNAN(force[i].x) || ISNAN(force[i].y) || ISNAN(force[i].z);
        const bool badBorn     = ISNAN(born[i]);
        if (!(badPosition || badVelocity || badForce || badBorn))
            continue;

        printf("%4d: %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f\n", i,
            posq[i].x,
            posq[i].y,
            posq[i].z,
            gpu->psVelm4->_pSysStream[0][i].x,
            gpu->psVelm4->_pSysStream[0][i].y,
            gpu->psVelm4->_pSysStream[0][i].z,
            force[i].x,
            force[i].y,
            force[i].z,
            born[i]
        );
        violations++;
    }

    // Clean data: nothing more to do.
    if (violations <= 0)
        return true;

    printf("%d total violations\n", violations);

    // Pass 2: for every atom, find and print its nearest neighbour.
    for (int i = 0; i < gpu->natoms; i++)
    {
        float dmin = 99999999.0f;
        int closest = -9999;
        const float x = posq[i].x;
        const float y = posq[i].y;
        const float z = posq[i].z;
        for (int j = 0; j < gpu->natoms; j++)
        {
            if (j == i)
                continue;
            float dx = posq[j].x - x;
            float dy = posq[j].y - y;
            float dz = posq[j].z - z;
            float r = sqrt(dx * dx + dy * dy + dz * dz);
            if (r < dmin)
            {
                dmin = r;
                closest = j;
            }
        }
        printf("Atom %4d: Closest neighbor is Atom %4d, %11.5e\n", i, closest, dmin);
    }

    gpuDumpAtomData(gpu);

    // CPU evaluation of the local forces on freshly cleared buffers.
    kClearBornForces(gpu);
    kClearForces(gpu);
    kCPUCalculateLocalForces(gpu);

    // Determine which forces have gone awry: evaluate each contribution on
    // its own and dump the result.
    kClearBornForces(gpu);
    kClearForces(gpu);
    kCalculateCDLJForces(gpu);
    kReduceForces(gpu);
    printf("Nonbond Forces\n");
    gpuDumpForces(gpu);

    kClearBornForces(gpu);
    kClearForces(gpu);
    kCalculateObcGbsaForces1(gpu);
    kReduceObcGbsaBornForces(gpu);
    kCalculateObcGbsaForces2(gpu);
    kReduceForces(gpu);
    printf("OBC Forces\n");
    gpuDumpForces(gpu);

    kClearBornForces(gpu);
    kClearForces(gpu);
    kCalculateLocalForces(gpu);
    kReduceForces(gpu);
    printf("Local Forces\n");
    gpuDumpForces(gpu);

    kClearBornForces(gpu);
    kClearForces(gpu);
    kReduceForces(gpu);
    printf("Cleared Forces\n");
    gpuDumpForces(gpu);
    return false;
}
// Host-side evaluation of the LJ 1-4 terms.
//
// Positions and forces are downloaded first; the LJ14 ID and parameter
// streams are NOT downloaded, their current host copies are used.  For each
// of the sim.LJ14s pairs the force is added to / subtracted from the host
// copy of the force stream at index atom + buffer * sim.stride, and one
// diagnostic line is printed.  The updated forces are not uploaded here.
extern "C"
void kCPUCalculate14(gpuContext gpu)
{
    gpu->psPosq4->Download();
    gpu->psForce4->Download();

    const int4*   pairID    = gpu->psLJ14ID->_pSysStream[0];
    const float4* pairParam = gpu->psLJ14Parameter->_pSysStream[0];
    const float4* posq      = gpu->psPosq4->_pSysStream[0];
    float4*       force     = gpu->psForce4->_pSysStream[0];

    for (int pos = 0; pos < (int) gpu->sim.LJ14s; pos++)
    {
        const int4   atom = pairID[pos];
        const float4 LJ14 = pairParam[pos];
        const float4 a1   = posq[atom.x];
        const float4 a2   = posq[atom.y];

        // Separation vector (atom.x minus atom.y) and its squared length.
        float3 d;
        d.x = a1.x - a2.x;
        d.y = a1.y - a2.y;
        d.z = a1.z - a2.z;
        const float r2       = d.x * d.x + d.y * d.y + d.z * d.z;
        const float inverseR = 1.0f / sqrt(r2);

        // (LJ14.y / r)^2 and its cube.
        const float sigOverR = inverseR * LJ14.y;
        const float sig2     = sigOverR * sigOverR;
        const float sig6     = sig2 * sig2 * sig2;

        // Lennard-Jones part, plus the LJ14.z / r part, scaled by 1 / r^2.
        float dEdR = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;
        dEdR += LJ14.z * inverseR;
        dEdR *= inverseR * inverseR;

        // Both force entries are read before either is written back.
        const unsigned int offsetA = atom.x + atom.z * gpu->sim.stride;
        const unsigned int offsetB = atom.y + atom.w * gpu->sim.stride;
        float4 forceA = force[offsetA];
        float4 forceB = force[offsetB];

        d.x *= dEdR;
        d.y *= dEdR;
        d.z *= dEdR;

        forceA.x += d.x;
        forceA.y += d.y;
        forceA.z += d.z;
        forceB.x -= d.x;
        forceB.y -= d.y;
        forceB.z -= d.z;

        force[offsetA] = forceA;
        force[offsetB] = forceB;

        printf("%4d: %4d - %4d: %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f\n", pos, atom.x, atom.y, r2, dEdR, sig2, sig6, LJ14.x, LJ14.z);
    }
}
// Download psPosqP4 and print the four components of each of its first
// gpu->natoms entries, one entry per line, to stdout.
extern "C"
void gpuDumpPrimeCoordinates(gpuContext gpu)
{
    gpu->psPosqP4->Download();

    int i = 0;
    while (i < gpu->natoms)
    {
        printf("%4d: %11.5f %11.5f %11.5f %11.5f\n", i,
            gpu->psPosqP4->_pSysStream[0][i].x,
            gpu->psPosqP4->_pSysStream[0][i].y,
            gpu->psPosqP4->_pSysStream[0][i].z,
            gpu->psPosqP4->_pSysStream[0][i].w
        );
        i++;
    }
}
// Download the force and Born-force streams and print, for each of the first
// gpu->natoms atoms, the x/y/z force components and the Born force on one
// line of stdout.
//
// Earlier the line was formatted into a local buffer whose only consumer (an
// OutputDebugString call) was commented out, so nothing was ever emitted.
// The text is now written with printf, like the other dump routines in this
// file.
extern "C"
void gpuDumpForces(gpuContext gpu)
{
    gpu->psForce4->Download();
    gpu->psBornForce->Download();

    const float4* force     = gpu->psForce4->_pSysStream[0];
    const float*  bornForce = gpu->psBornForce->_pSysStream[0];
    for (int i = 0; i < gpu->natoms; i++)
    {
        printf("%4d: %11.5f %11.5f %11.5f %11.5f\n", i,
            force[i].x,
            force[i].y,
            force[i].z,
            bornForce[i]
        );
    }
}
// Download the per-atom data streams and print one line per atom (first
// gpu->natoms atoms) to stdout: the four psPosq4 components, the two
// psSigEps2 components, the psBornRadii value and the psObcChain value.
//
// Two defects of the earlier version are fixed:
//   - the format string had seven float conversions for eight values, so the
//     psObcChain value was silently dropped;
//   - the line was formatted into a local buffer whose only consumer (an
//     OutputDebugString call) was commented out, so nothing was emitted.
extern "C"
void gpuDumpAtomData(gpuContext gpu)
{
    gpu->psPosq4->Download();
    gpu->psSigEps2->Download();
    gpu->psBornRadii->Download();
    gpu->psObcChain->Download();

    const float4* posq = gpu->psPosq4->_pSysStream[0];
    for (int i = 0; i < gpu->natoms; i++)
    {
        printf("%4d: %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f\n", i,
            posq[i].x,
            posq[i].y,
            posq[i].z,
            posq[i].w,
            gpu->psSigEps2->_pSysStream[0][i].x,
            gpu->psSigEps2->_pSysStream[0][i].y,
            gpu->psBornRadii->_pSysStream[0][i],
            gpu->psObcChain->_pSysStream[0][i]
        );
    }
}
// Stand-alone set-up of a context from parameter files under "Data/".
//
// pVoid is cast to gpuContext without any check.  Each gpuRead* call's
// return value is streamed to cout followed by a label.  Afterwards the work
// list, the exclusion list and the output buffers are built (in that order),
// the constants are pushed, the random streams are initialised and the Born
// sum is computed and reduced once before forces and Born forces are cleared.
//
// The file names are hard-coded relative paths, so the result depends on the
// current working directory.
extern "C"
void gpuSetup(void* pVoid)
{
gpuContext gpu = (gpuContext)pVoid;
// Read parameters; each line prints the value returned by the reader
cout << gpuReadAtomicParameters(gpu, "Data/atomicradii.txt") << " atom types\n";
cout << gpuReadBondParameters(gpu, "Data/GromacsHarmonicBondParameter.txt") << " bond parameters.\n";
cout << gpuReadBondAngleParameters(gpu, "Data/GromacsAngleBondParameter.txt") << " bond angle parameters.\n";
cout << gpuReadDihedralParameters(gpu, "Data/GromacsProperDihedralParameter.txt") << " proper dihedral parameters.\n";
cout << gpuReadRbDihedralParameters(gpu, "Data/GromacsRbDihedralParameter.txt") << " Ryckaert-Bellemans dihedral parameters.\n";
cout << gpuReadLJ14Parameters(gpu, "Data/GromacsLJ14Parameter.txt") << " Lennard-Jones 1-4 parameters.\n";
cout << gpuReadCoulombParameters(gpu, "Data/GromacsLJCoulombParameter.txt") << " Coulomb parameters.\n";
cout << gpuReadShakeParameters(gpu, "Data/GromacsShakeParameters.txt") << " shake parameters.\n";
// Build thread block work list (this also calls gpuSetConstants)
gpuBuildThreadBlockWorkList(gpu);
// Build exclusion list (this also calls gpuSetConstants)
gpuBuildExclusionList(gpu);
// Create output buffers
gpuBuildOutputBuffers(gpu);
// Set constant blocks again, now that the output buffers exist
gpuSetConstants(gpu);
// Initialize randoms
gpuInitializeRandoms(gpu);
// Initialize Born radii, then start from cleared force buffers
kCalculateObcGbsaBornSum(gpu);
kReduceObcGbsaBornSum(gpu);
kClearForces(gpu);
kClearBornForces(gpu);
return;
}
// Helper macros for the CPU reference force code.  Arguments are pasted
// without extra parentheses, so pass simple lvalues (e.g. A->v0), not
// arbitrary expressions.

// Dot product of the x/y/z components of v1 and v2.
#define DOT3(v1, v2) (v1.x * v2.x + v1.y * v2.y + v1.z * v2.z)
// dp = (v1 . v2) / sqrt(|v1|^2 * |v2|^2), clamped to [-1, 1].
// Declares locals norm1 and norm2 inside its own brace scope.
#define GETNORMEDDOTPRODUCT(v1, v2, dp) \
{ \
dp = DOT3(v1, v2); \
float norm1 = DOT3(v1, v1); \
float norm2 = DOT3(v2, v2); \
dp /= sqrt(norm1 * norm2); \
dp = min(dp, 1.0f); \
dp = max(dp, -1.0f); \
}
// c = v1 x v2.  Expands to three separate statements that are NOT wrapped in
// braces, so it must not be used as the sole body of an unbraced if/else/loop.
#define CROSS_PRODUCT(v1, v2, c) \
c.x = v1.y * v2.z - v1.z * v2.y; \
c.y = v1.z * v2.x - v1.x * v2.z; \
c.z = v1.x * v2.y - v1.y * v2.x;
// dEdR = param.y * (acos(cosine) - param.x * pi / 180); param.x is scaled by
// pi/180, i.e. treated as degrees.
#define GETPREFACTORSGIVENANGLECOSINE(cosine, param, dEdR) \
{ \
float angle = acos(cosine); \
float deltaIdeal = angle - (param.x * (3.14159265f / 180.0f)); \
dEdR = param.y * deltaIdeal; \
}
// angle = acos of the clamped, normalised dot product of v1 and v2.
#define GETANGLEBETWEENTWOVECTORS(v1, v2, angle) \
{ \
float dp; \
GETNORMEDDOTPRODUCT(v1, v2, dp); \
angle = acos(dp); \
}
// Same as GETANGLEBETWEENTWOVECTORS, but also returns the cosine.
#define GETANGLECOSINEBETWEENTWOVECTORS(v1, v2, angle, cosine) \
{ \
GETNORMEDDOTPRODUCT(v1, v2, cosine); \
angle = acos(cosine); \
}
// cp0 = vector1 x vector2, cp1 = vector2 x vector3, angle = angle between
// cp0 and cp1, negated when signVector . cp1 is negative.
#define GETDIHEDRALANGLEBETWEENTHREEVECTORS(vector1, vector2, vector3, signVector, cp0, cp1, angle) \
{ \
CROSS_PRODUCT(vector1, vector2, cp0); \
CROSS_PRODUCT(vector2, vector3, cp1); \
GETANGLEBETWEENTWOVECTORS(cp0, cp1, angle); \
float dp = DOT3(signVector, cp1); \
angle = (dp >= 0) ? angle : -angle; \
}
// As above, additionally returning the cosine of the (unsigned) angle
// between cp0 and cp1.
#define GETDIHEDRALANGLECOSINEBETWEENTHREEVECTORS(vector1, vector2, vector3, signVector, cp0, cp1, angle, cosine) \
{ \
CROSS_PRODUCT(vector1, vector2, cp0); \
CROSS_PRODUCT(vector2, vector3, cp1); \
GETANGLECOSINEBETWEENTWOVECTORS(cp0, cp1, angle, cosine); \
float dp = DOT3(signVector, cp1); \
angle = (dp >= 0) ? angle : -angle; \
}
// Calculate Local forces on CPU
//
// Diagnostic host-side evaluation of the bonded ("local") forces.
//
// All position, force, ID and parameter streams are downloaded first.  Only
// the harmonic-bond loop is compiled; the bond-angle, dihedral,
// Ryckaert-Bellemans dihedral and LJ 1-4 loops are disabled by the #if 0
// block below.  Forces are accumulated into the HOST copy of psForce4 at
// index atom + buffer * sim.stride; no Upload() is issued in this function.
//
// A bond whose length differs from bond.x by more than 1.0 is printed and
// counted as a violation; if any violation occurred the coordinates and
// forces are dumped at the end.
extern "C"
void kCPUCalculateLocalForces(gpuContext gpu)
{
gpu->psPosq4->Download();
gpu->psForce4->Download();
gpu->psBondID->Download();
gpu->psBondParameter->Download();
gpu->psBondAngleID1->Download();
gpu->psBondAngleID2->Download();
gpu->psBondAngleParameter->Download();
gpu->psDihedralID1->Download();
gpu->psDihedralID2->Download();
gpu->psDihedralParameter->Download();
gpu->psRbDihedralID1->Download();
gpu->psRbDihedralID2->Download();
gpu->psRbDihedralParameter1->Download();
gpu->psRbDihedralParameter2->Download();
gpu->psLJ14ID->Download();
gpu->psLJ14Parameter->Download();
// pos walks the combined index space of all bonded terms; bond_offset marks
// the end of the bond slots (entries at or beyond sim.bonds are skipped).
unsigned int pos = 0;
// V / A are scratch vectors used only by the disabled sections below.
Vectors V;
Vectors* A = &V;
int violations = 0;
while (pos < gpu->sim.bond_offset)
{
if (pos < gpu->sim.bonds)
{
// atom.x / atom.y: the two bonded atoms; atom.z / atom.w: the output
// buffer index used for each of them.
int4 atom = gpu->psBondID->_pSysStream[0][pos];
float4 atomA = gpu->psPosq4->_pSysStream[0][atom.x];
float4 atomB = gpu->psPosq4->_pSysStream[0][atom.y];
float2 bond = gpu->psBondParameter->_pSysStream[0][pos];
float dx = atomB.x - atomA.x;
float dy = atomB.y - atomA.y;
float dz = atomB.z - atomA.z;
float r2 = dx * dx + dy * dy + dz * dz;
float r = sqrt(r2);
// bond.x is the reference length, bond.y the multiplier applied to the
// deviation from it.
float deltaIdeal = r - bond.x;
float dEdR = bond.y * deltaIdeal;
// Guard against a zero-length bond before dividing by r.
dEdR = (r > 0.0f) ? (dEdR / r) : 0.0f;
if (fabs(deltaIdeal) > 1.0f)
{
printf("Bond %4d: %11.4f %11.4f %11.4f %11.4f %11.4f %11.4f\n", pos, dx, dy, dz, r, deltaIdeal, dEdR);
violations++;
}
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
unsigned int offsetA = atom.x + atom.z * gpu->sim.stride;
unsigned int offsetB = atom.y + atom.w * gpu->sim.stride;
float4 forceA = gpu->psForce4->_pSysStream[0][offsetA];
float4 forceB = gpu->psForce4->_pSysStream[0][offsetB];
forceA.x += dx;
forceA.y += dy;
forceA.z += dz;
forceB.x -= dx;
forceB.y -= dy;
forceB.z -= dz;
gpu->psForce4->_pSysStream[0][offsetA] = forceA;
gpu->psForce4->_pSysStream[0][offsetB] = forceB;
}
pos++;
}
// Everything up to the matching #endif is compiled out.
// NOTE(review): several printf calls in the disabled dihedral sections pass
// whole psForce4 elements (float4, as read above) to %9.4f conversions; this
// needs fixing before the block is re-enabled.
#if 0
while (pos < gpu->sim.bond_angle_offset)
{
unsigned int pos1 = pos - gpu->sim.bond_offset;
if (pos1 < gpu->sim.bond_angles)
{
int4 atom1 = gpu->psBondAngleID1->_pSysStream[0][pos1];
float2 bond_angle = gpu->psBondAngleParameter->_pSysStream[0][pos1];
float4 a1 = gpu->psPosq4->_pSysStream[0][atom1.x];
float4 a2 = gpu->psPosq4->_pSysStream[0][atom1.y];
float4 a3 = gpu->psPosq4->_pSysStream[0][atom1.z];
A->v0.x = a2.x - a1.x;
A->v0.y = a2.y - a1.y;
A->v0.z = a2.z - a1.z;
A->v1.x = a2.x - a3.x;
A->v1.y = a2.y - a3.y;
A->v1.z = a2.z - a3.z;
float3 cp;
CROSS_PRODUCT(A->v0, A->v1, cp);
float rp = DOT3(cp, cp); //cx * cx + cy * cy + cz * cz;
rp = max(sqrt(rp), 1.0e-06f);
float r21 = DOT3(A->v0, A->v0); // dx1 * dx1 + dy1 * dy1 + dz1 * dz1;
float r23 = DOT3(A->v1, A->v1); // dx2 * dx2 + dy2 * dy2 + dz2 * dz2;
float dot = DOT3(A->v0, A->v1); // dx1 * dx2 + dy1 * dy2 + dz1 * dz2;
float cosine = dot / sqrt(r21 * r23);
float dEdR;
GETPREFACTORSGIVENANGLECOSINE(cosine, bond_angle, dEdR);
printf("Bond angle %4d %11.4f %11.4f\n", pos1, cosine, dEdR);
float termA = dEdR / (r21 * rp);
float termC = -dEdR / (r23 * rp);
float3 c21;
float3 c23;
CROSS_PRODUCT(A->v0, cp, c21);
CROSS_PRODUCT(A->v1, cp, c23);
c21.x *= termA;
c21.y *= termA;
c21.z *= termA;
c23.x *= termC;
c23.y *= termC;
c23.z *= termC;
int2 atom2 = gpu->psBondAngleID2->_pSysStream[0][pos1];
unsigned int offset = atom1.x + atom1.w * gpu->sim.stride;
float4 force = gpu->psForce4->_pSysStream[0][offset];
force.x += c21.x;
force.y += c21.y;
force.z += c21.z;
gpu->psForce4->_pSysStream[0][offset] = force;
offset = atom1.y + atom2.x * gpu->sim.stride;
force = gpu->psForce4->_pSysStream[0][offset];
force.x -= (c21.x + c23.x);
force.y -= (c21.y + c23.y);
force.z -= (c21.z + c23.z);
gpu->psForce4->_pSysStream[0][offset] = force;
offset = atom1.z + atom2.y * gpu->sim.stride;
force = gpu->psForce4->_pSysStream[0][offset];
force.x += c23.x;
force.y += c23.y;
force.z += c23.z;
gpu->psForce4->_pSysStream[0][offset] = force;
}
pos++;
}
while (pos < gpu->sim.dihedral_offset)
{
unsigned int pos1 = pos - gpu->sim.bond_angle_offset;
if (pos1 < gpu->sim.dihedrals)
{
int4 atom1 = gpu->psDihedralID1->_pSysStream[0][pos1];
float4 atomA = gpu->psPosq4->_pSysStream[0][atom1.x];
float4 atomB = gpu->psPosq4->_pSysStream[0][atom1.y];
float4 atomC = gpu->psPosq4->_pSysStream[0][atom1.z];
float4 atomD = gpu->psPosq4->_pSysStream[0][atom1.w];
A->v0.x = atomA.x - atomB.x;
A->v0.y = atomA.y - atomB.y;
A->v0.z = atomA.z - atomB.z;
A->v1.x = atomC.x - atomB.x;
A->v1.y = atomC.y - atomB.y;
A->v1.z = atomC.z - atomB.z;
A->v2.x = atomC.x - atomD.x;
A->v2.y = atomC.y - atomD.y;
A->v2.z = atomC.z - atomD.z;
float3 cp0, cp1;
float dihedralAngle;
GETDIHEDRALANGLEBETWEENTHREEVECTORS(A->v0, A->v1, A->v2, A->v0, cp0, cp1, dihedralAngle);
float4 dihedral = gpu->psDihedralParameter->_pSysStream[0][pos1];
float deltaAngle = dihedral.z * dihedralAngle - (dihedral.y * 3.14159265f / 180.0f);
float sinDeltaAngle = sin(deltaAngle);
float dEdAngle = -dihedral.x * dihedral.z * sinDeltaAngle;
float normCross1 = DOT3(cp0, cp0);
float normBC = sqrt(DOT3(A->v1, A->v1));
float4 ff;
ff.x = (-dEdAngle * normBC) / normCross1;
float normCross2 = DOT3(cp1, cp1);
ff.w = (dEdAngle * normBC) / normCross2;
float dp = 1.0f / DOT3(A->v1, A->v1);
ff.y = DOT3(A->v0, A->v1) * dp;
ff.z = DOT3(A->v2, A->v1) * dp;
int4 atom2 = gpu->psDihedralID2->_pSysStream[0][pos1];
float3 internalF0;
float3 internalF3;
float3 s;
// printf("%4d: %9.4f %9.4f %9.4f %9.4f\n", pos1, ff.x, ff.y, ff.z, ff.w);
unsigned int offset = atom1.x + atom2.x * gpu->sim.stride;
float4 force = gpu->psForce4->_pSysStream[0][offset];
internalF0.x = ff.x * cp0.x;
force.x += internalF0.x;
internalF0.y = ff.x * cp0.y;
force.y += internalF0.y;
internalF0.z = ff.x * cp0.z;
force.z += internalF0.z;
gpu->psForce4->_pSysStream[0][offset] = force;
printf("Dihedral %4d - 0: %9.4f %9.4f %9.4f\n", pos1, gpu->psForce4->_pSysStream[0][offset], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride2]);
offset = atom1.w + atom2.w * gpu->sim.stride;
force = gpu->psForce4->_pSysStream[0][offset];
internalF3.x = ff.w * cp1.x;
force.x += internalF3.x;
internalF3.y = ff.w * cp1.y;
force.y += internalF3.y;
internalF3.z = ff.w * cp1.z;
force.z += internalF3.z;
gpu->psForce4->_pSysStream[0][offset] = force;
printf("Dihedral %4d - 3: %9.4f %9.4f %9.4f\n", pos1, gpu->psForce4->_pSysStream[0][offset], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride2]);
s.x = ff.y * internalF0.x - ff.z * internalF3.x;
s.y = ff.y * internalF0.y - ff.z * internalF3.y;
s.z = ff.y * internalF0.z - ff.z * internalF3.z;
offset = atom1.y + atom2.y * gpu->sim.stride;
force = gpu->psForce4->_pSysStream[0][offset];
force.x += -internalF0.x + s.x;
force.y += -internalF0.y + s.y;
force.z += -internalF0.z + s.z;
gpu->psForce4->_pSysStream[0][offset] = force;
printf("Dihedral %4d - 1: %9.4f %9.4f %9.4f\n", pos1, gpu->psForce4->_pSysStream[0][offset], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride2]);
offset = atom1.z + atom2.z * gpu->sim.stride;
force = gpu->psForce4->_pSysStream[0][offset];
force.x += -internalF3.x - s.x;
force.y += -internalF3.y - s.y;
force.z += -internalF3.z - s.z;
gpu->psForce4->_pSysStream[0][offset] = force;
printf("Dihedral %4d - 2: %9.4f %9.4f %9.4f\n", pos1, gpu->psForce4->_pSysStream[0][offset], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride2]);
}
pos++;
}
while (pos < gpu->sim.rb_dihedral_offset)
{
unsigned int pos1 = pos - gpu->sim.dihedral_offset;
if (pos1 < gpu->sim.rb_dihedrals)
{
int4 atom1 = gpu->psRbDihedralID1->_pSysStream[0][pos1];
float4 atomA = gpu->psPosq4->_pSysStream[0][atom1.x];
float4 atomB = gpu->psPosq4->_pSysStream[0][atom1.y];
float4 atomC = gpu->psPosq4->_pSysStream[0][atom1.z];
float4 atomD = gpu->psPosq4->_pSysStream[0][atom1.w];
A->v0.x = atomA.x - atomB.x;
A->v0.y = atomA.y - atomB.y;
A->v0.z = atomA.z - atomB.z;
A->v1.x = atomC.x - atomB.x;
A->v1.y = atomC.y - atomB.y;
A->v1.z = atomC.z - atomB.z;
A->v2.x = atomC.x - atomD.x;
A->v2.y = atomC.y - atomD.y;
A->v2.z = atomC.z - atomD.z;
float3 cp0, cp1;
float dihedralAngle, cosPhi;
// printf("%4d - 0 : %9.4f %9.4f %9.4f\n", pos1, A->v0.x, A->v0.y, A->v0.z);
// printf("%4d - 1 : %9.4f %9.4f %9.4f\n", pos1, A->v1.x, A->v1.y, A->v1.z);
// printf("%4d - 2 : %9.4f %9.4f %9.4f\n", pos1, A->v2.x, A->v2.y, A->v2.z);
GETDIHEDRALANGLECOSINEBETWEENTHREEVECTORS(A->v0, A->v1, A->v2, A->v0, cp0, cp1, dihedralAngle, cosPhi);
if (dihedralAngle < 0.0f )
{
dihedralAngle += 3.14159265f;
}
else
{
dihedralAngle -= 3.14159265f;
}
cosPhi = -cosPhi;
// printf("%4d: %9.4f %9.4f\n", pos1, dihedralAngle, cosPhi);
float4 dihedral1 = gpu->psRbDihedralParameter1->_pSysStream[0][pos1];
float2 dihedral2 = gpu->psRbDihedralParameter2->_pSysStream[0][pos1];
float cosFactor = cosPhi;
float dEdAngle = -dihedral1.y;
// printf("%4d - 1: %9.4f %9.4f\n", pos1, dEdAngle, 1.0f);
dEdAngle -= 2.0f * dihedral1.z * cosFactor;
// printf("%4d - 2: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
cosFactor *= cosPhi;
dEdAngle -= 3.0f * dihedral1.w * cosFactor;
// printf("%4d - 3: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
cosFactor *= cosPhi;
dEdAngle -= 4.0f * dihedral2.x * cosFactor;
// printf("%4d - 4: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
cosFactor *= cosPhi;
dEdAngle -= 5.0f * dihedral2.y * cosFactor;
// printf("%4d - 5: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
dEdAngle *= sin(dihedralAngle);
// printf("%4d - f: %9.4f\n", pos1, dEdAngle);
float normCross1 = DOT3(cp0, cp0);
float normBC = sqrt(DOT3(A->v1, A->v1));
float4 ff;
ff.x = (-dEdAngle * normBC) / normCross1;
float normCross2 = DOT3(cp1, cp1);
ff.w = (dEdAngle * normBC) / normCross2;
float dp = 1.0f / DOT3(A->v1, A->v1);
ff.y = DOT3(A->v0, A->v1) * dp;
ff.z = DOT3(A->v2, A->v1) * dp;
int4 atom2 = gpu->psRbDihedralID2->_pSysStream[0][pos1];
float3 internalF0;
float3 internalF3;
float3 s;
printf("RB Dihedral %4d: %9.4f %9.4f %9.4f %9.4f\n", pos1, ff.x, ff.y, ff.z, ff.w);
unsigned int offset = atom1.x + atom2.x * gpu->sim.stride;
float4 force = gpu->psForce4->_pSysStream[0][offset];
internalF0.x = ff.x * cp0.x;
force.x += internalF0.x;
internalF0.y = ff.x * cp0.y;
force.y += internalF0.y;
internalF0.z = ff.x * cp0.z;
force.z += internalF0.z;
gpu->psForce4->_pSysStream[0][offset] = force;
printf("RB Dihedral %4d - 0: %9.4f %9.4f %9.4f\n", pos1, gpu->psForce4->_pSysStream[0][offset], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride2]);
offset = atom1.w + atom2.w * gpu->sim.stride;
force = gpu->psForce4->_pSysStream[0][offset];
internalF3.x = ff.w * cp1.x;
force.x += internalF3.x;
internalF3.y = ff.w * cp1.y;
force.y += internalF3.y;
internalF3.z = ff.w * cp1.z;
force.z += internalF3.z;
gpu->psForce4->_pSysStream[0][offset] = force;
printf("RB Dihedral %4d - 3: %9.4f %9.4f %9.4f\n", pos1, gpu->psForce4->_pSysStream[0][offset], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride2]);
s.x = ff.y * internalF0.x - ff.z * internalF3.x;
s.y = ff.y * internalF0.y - ff.z * internalF3.y;
s.z = ff.y * internalF0.z - ff.z * internalF3.z;
offset = atom1.y + atom2.y * gpu->sim.stride;
force = gpu->psForce4->_pSysStream[0][offset];
force.x += -internalF0.x + s.x;
force.y += -internalF0.y + s.y;
force.z += -internalF0.z + s.z;
gpu->psForce4->_pSysStream[0][offset] = force;
printf("RB Dihedral %4d - 1: %9.4f %9.4f %9.4f\n", pos1, gpu->psForce4->_pSysStream[0][offset], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride2]);
offset = atom1.z + atom2.z * gpu->sim.stride;
force = gpu->psForce4->_pSysStream[0][offset];
force.x += -internalF3.x - s.x;
force.y += -internalF3.y - s.y;
force.z += -internalF3.z - s.z;
gpu->psForce4->_pSysStream[0][offset] = force;
// printf("%4d - 2: %9.4f %9.4f %9.4f\n", pos1, gpu->psForce4->_pSysStream[0][offset], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride], gpu->psForce4->_pSysStream[0][offset + gpu->sim.stride2]);
}
pos++;
}
while (pos < gpu->sim.LJ14_offset)
{
unsigned int pos1 = pos - gpu->sim.rb_dihedral_offset;
if (pos1 < gpu->sim.LJ14s)
{
int4 atom = gpu->psLJ14ID->_pSysStream[0][pos1];
float4 LJ14 = gpu->psLJ14Parameter->_pSysStream[0][pos1];
float4 a1 = gpu->psPosq4->_pSysStream[0][atom.x];
float4 a2 = gpu->psPosq4->_pSysStream[0][atom.y];
float3 d;
d.x = a1.x - a2.x;
d.y = a1.y - a2.y;
d.z = a1.z - a2.z;
float r2 = DOT3(d, d);
float inverseR = 1.0f / sqrt(r2);
float sig2 = inverseR * LJ14.y;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float dEdR = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;
dEdR += LJ14.z * inverseR;
dEdR *= inverseR * inverseR;
unsigned int offsetA = atom.x + atom.z * gpu->sim.stride;
unsigned int offsetB = atom.y + atom.w * gpu->sim.stride;
float4 forceA = gpu->psForce4->_pSysStream[0][offsetA];
float4 forceB = gpu->psForce4->_pSysStream[0][offsetB];
d.x *= dEdR;
d.y *= dEdR;
d.z *= dEdR;
forceA.x += d.x;
forceA.y += d.y;
forceA.z += d.z;
forceB.x -= d.x;
forceB.y -= d.y;
forceB.z -= d.z;
printf("LJ14 %d: %11.4f %11.4f %11.4f\n", pos1, d.x, d.y, d.z);
gpu->psForce4->_pSysStream[0][offsetA] = forceA;
gpu->psForce4->_pSysStream[0][offsetB] = forceB;
}
pos++;
}
#endif
// Only bond-length violations are counted (see the bond loop above).
if (violations > 0)
{
gpuDumpCoordinates(gpu);
gpuDumpForces(gpu);
}
}
/**
 * Open "<fname>_<step>.txt" for writing and return the stream.
 *
 * @param fname  prefix of the output file name
 * @param step   integer appended to the prefix (after an underscore)
 * @return       an open FILE* in "w" mode; the caller is responsible for fclose()
 *
 * If the file cannot be opened, a message is written to stderr and the
 * process exits with status -1, so the function never returns NULL.
 */
static FILE* getWriteToFilePtr( char* fname, int step )
{
    std::stringstream nameBuilder;
    nameBuilder << fname << "_" << step << ".txt";

    // Materialize the name once; it is needed for both fopen() and the error message.
    const std::string fullName = nameBuilder.str();

    FILE* filePtr = fopen( fullName.c_str(), "w" );
    if( filePtr == NULL ){
        (void) fprintf( stderr, "Could not open file=<%s> for writing.\n", fullName.c_str() );
        exit(-1);
    }
    return filePtr;
}
extern "C" {
/**
 * Write one row to filePtr: the index in a 5-wide field, then each value in
 * " %18.10e" format, then a newline. The stream is flushed after every row.
 *
 * @param filePtr         open output stream
 * @param index           row label printed first
 * @param numberOfValues  how many entries of values[] to print
 * @param values          array holding at least numberOfValues floats
 */
static void printValues( FILE* filePtr, int index, int numberOfValues, float* values )
{
    fprintf( filePtr, "%5d ", index );

    const float* cursor = values;
    int remaining = numberOfValues;
    while ( remaining-- > 0 ) {
        fprintf( filePtr, " %18.10e", *cursor++ );
    }

    fprintf( filePtr, "\n" );
    fflush( filePtr );
}
}
/**
 * Dump the head and tail of a float stream to "<fname>_<step>.txt".
 *
 * The first numPrint and the last numPrint entries (indexed by atom) are
 * written, one row per entry. When numPrint is non-positive or not smaller
 * than gpu->natoms, every entry is written.
 *
 * @param gpu       context; only gpu->natoms is read here
 * @param fname     output file name prefix
 * @param step      integer appended to the file name
 * @param psPos     stream to dump; Download() is called on it before reading
 *                  its _pSysStream[0] array (presumably refreshing the host copy)
 * @param numPrint  number of leading/trailing entries to print
 */
extern "C"
void WriteArrayToFile1( gpuContext gpu, char* fname, int step, CUDAStream<float>* psPos, int numPrint )
{
    static const int numberOfValues = 1;
    FILE* filePtr = getWriteToFilePtr( fname, step );
    float values[numberOfValues];
    psPos->Download();

    const int natoms = gpu->natoms;
    numPrint = (numPrint > 0 && (numPrint < natoms)) ? numPrint : natoms;

    // Leading entries.
    for ( int i = 0; i < numPrint; i++ ) {
        values[0] = psPos->_pSysStream[0][i];
        printValues( filePtr, i, numberOfValues, values );
    }

    // Trailing entries. Start no earlier than numPrint so that rows already
    // written by the first loop are not emitted a second time (previously the
    // whole array was printed twice whenever numPrint defaulted to natoms).
    int tailStart = natoms - numPrint;
    if ( tailStart < numPrint ) {
        tailStart = numPrint;
    }
    for ( int i = tailStart; i < natoms; i++ ) {
        values[0] = psPos->_pSysStream[0][i];
        printValues( filePtr, i, numberOfValues, values );
    }
    (void) fclose( filePtr );
}
/**
 * Dump the head and tail of a float2 stream to "<fname>_<step>.txt".
 *
 * The first numPrint and the last numPrint entries (indexed by atom) are
 * written, one row per entry with the x and y components. When numPrint is
 * non-positive or not smaller than gpu->natoms, every entry is written.
 *
 * @param gpu       context; only gpu->natoms is read here
 * @param fname     output file name prefix
 * @param step      integer appended to the file name
 * @param psPos     stream to dump; Download() is called on it before reading
 *                  its _pSysStream[0] array (presumably refreshing the host copy)
 * @param numPrint  number of leading/trailing entries to print
 */
extern "C"
void WriteArrayToFile2( gpuContext gpu, char* fname, int step, CUDAStream<float2>* psPos, int numPrint )
{
    static const int numberOfValues = 2;
    FILE* filePtr = getWriteToFilePtr( fname, step );
    float values[numberOfValues];
    psPos->Download();

    const int natoms = gpu->natoms;
    numPrint = (numPrint > 0 && (numPrint < natoms)) ? numPrint : natoms;

    // Leading entries.
    for ( int i = 0; i < numPrint; i++ ) {
        values[0] = psPos->_pSysStream[0][i].x;
        values[1] = psPos->_pSysStream[0][i].y;
        printValues( filePtr, i, numberOfValues, values );
    }

    // Trailing entries. Start no earlier than numPrint so that rows already
    // written by the first loop are not emitted a second time (previously the
    // whole array was printed twice whenever numPrint defaulted to natoms).
    int tailStart = natoms - numPrint;
    if ( tailStart < numPrint ) {
        tailStart = numPrint;
    }
    for ( int i = tailStart; i < natoms; i++ ) {
        values[0] = psPos->_pSysStream[0][i].x;
        values[1] = psPos->_pSysStream[0][i].y;
        printValues( filePtr, i, numberOfValues, values );
    }
    (void) fclose( filePtr );
}
/**
 * Dump the head and tail of a float4 stream to "<fname>_<step>.txt".
 *
 * The first numPrint and the last numPrint entries (indexed by atom) are
 * written, one row per entry with the x, y, z and w components. When numPrint
 * is non-positive or not smaller than gpu->natoms, every entry is written.
 *
 * @param gpu       context; only gpu->natoms is read here
 * @param fname     output file name prefix
 * @param step      integer appended to the file name
 * @param psPos     stream to dump; Download() is called on it before reading
 *                  its _pSysStream[0] array (presumably refreshing the host copy)
 * @param numPrint  number of leading/trailing entries to print
 */
extern "C"
void WriteArrayToFile4( gpuContext gpu, char* fname, int step, CUDAStream<float4>* psPos, int numPrint )
{
    static const int numberOfValues = 4;
    FILE* filePtr = getWriteToFilePtr( fname, step );
    float values[numberOfValues];
    psPos->Download();

    const int natoms = gpu->natoms;
    numPrint = (numPrint > 0 && (numPrint < natoms)) ? numPrint : natoms;

    // Leading entries.
    for ( int i = 0; i < numPrint; i++ ) {
        values[0] = psPos->_pSysStream[0][i].x;
        values[1] = psPos->_pSysStream[0][i].y;
        values[2] = psPos->_pSysStream[0][i].z;
        values[3] = psPos->_pSysStream[0][i].w;
        printValues( filePtr, i, numberOfValues, values );
    }

    // Trailing entries. Start no earlier than numPrint so that rows already
    // written by the first loop are not emitted a second time (previously the
    // whole array was printed twice whenever numPrint defaulted to natoms).
    int tailStart = natoms - numPrint;
    if ( tailStart < numPrint ) {
        tailStart = numPrint;
    }
    for ( int i = tailStart; i < natoms; i++ ) {
        values[0] = psPos->_pSysStream[0][i].x;
        values[1] = psPos->_pSysStream[0][i].y;
        values[2] = psPos->_pSysStream[0][i].z;
        values[3] = psPos->_pSysStream[0][i].w;
        printValues( filePtr, i, numberOfValues, values );
    }
    (void) fclose( filePtr );
}
/**
 * Print a per-atom table to stdout: the four psPosq4 components, the Born
 * radius, the Born sum, and the two psObcData components.
 *
 * Download() is called on each of the four streams before their
 * _pSysStream[0] arrays are read.
 */
extern "C"
void gpuDumpObcInfo(gpuContext gpu)
{
    gpu->psPosq4->Download();
    gpu->psBornRadii->Download();
    gpu->psObcData->Download();
    gpu->psBornSum->Download();

    printf( "\n\nObc Info xyzw Brad atomR scaledAtomR\n" );

    const int atomCount = gpu->natoms;
    for (int atom = 0; atom < atomCount; ++atom)
    {
        const float4& posq       = gpu->psPosq4->_pSysStream[0][atom];
        const float2& obcData    = gpu->psObcData->_pSysStream[0][atom];
        const float   bornRadius = gpu->psBornRadii->_pSysStream[0][atom];
        const float   bornSum    = gpu->psBornSum->_pSysStream[0][atom];

        printf("%4d: %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f\n", atom,
               posq.x, posq.y, posq.z, posq.w,
               bornRadius, bornSum,
               obcData.x, obcData.y);
    }
}
/**
 * Print a per-atom table to stdout: force x/y/z, Born radius, the quotient
 * bornForce / (bornRadius^2 * obcChain), the Born force, and the OBC chain value.
 *
 * Download() is called on psForce4, psBornRadii, psBornForce, psObcChain and
 * psBornSum first; psBornSum is downloaded but not printed.
 */
extern "C"
void gpuDumpObcLoop1(gpuContext gpu)
{
    gpu->psForce4->Download();
    gpu->psBornRadii->Download();
    gpu->psBornForce->Download();
    gpu->psObcChain->Download();
    gpu->psBornSum->Download();

    printf( "\n\nObc F3 BrnR BrnF Chn\n" );

    const int atomCount = gpu->natoms;
    for (int atom = 0; atom < atomCount; ++atom)
    {
        const float4& force      = gpu->psForce4->_pSysStream[0][atom];
        const float   bornRadius = gpu->psBornRadii->_pSysStream[0][atom];
        const float   bornForce  = gpu->psBornForce->_pSysStream[0][atom];
        const float   obcChain   = gpu->psObcChain->_pSysStream[0][atom];

        // Born force with the radius^2 * chain factor divided out.
        const float compF = bornForce/(bornRadius*bornRadius*obcChain);

        printf("%4d: %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f\n", atom,
               force.x, force.y, force.z,
               bornRadius,
               compF,
               bornForce,
               obcChain);
    }
}
#ifndef __GPUTYPES_H__
#define __GPUTYPES_H__
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "cudatypes.h"
#include <vector>
// One entry of the atom-type table referenced by _gpuContext::gpAtomTable.
// NOTE: `string` is unqualified, so this header relies on the including file
// having a `using namespace std;` (and <string>) in effect before the #include.
struct gpuAtomType {
// Type name used as lookup key (presumably matched by gpuGetAtomicRadius/gpuGetAtomicSymbol -- confirm).
string name;
// Single-character symbol for this type.
char symbol;
// Presumably the atomic radius -- units not visible here, TODO confirm.
float r;
};
// Device capability level stored in _gpuContext::sm_version.
// Presumably corresponds to CUDA compute capability 1.0 / 1.1 / 1.2 -- confirm.
// The enumerators must stay in ascending order: callers compare them with `<`
// (e.g. `gpu->sm_version < SM_12` selects between kernel variants).
enum SM_VERSION
{
SM_10,
SM_11,
SM_12
};
/* Pointer to this structure will be given
* to gromacs functions*/
// Host-side state for one simulation: scalar settings, the `sim` block that is
// copied to device constant memory by the Set*Sim functions, and one
// CUDAStream per data array. Code in this file reads host data through
// `stream->_pSysStream[0][i]` after calling `stream->Download()`.
struct _gpuContext {
//Cache this here so that it doesn't
//have to be repeatedly passed around
int natoms;
// Atom-type table and (presumably) its entry count -- confirm.
gpuAtomType* gpAtomTable;
int gAtomTypes;
// Simulation parameters/pointers mirrored into each kernel file's `cSim` constant.
cudaGmxSimulation sim;
unsigned int* pOutputBufferCounter;
unsigned int* pExclusion;
unsigned char* pAtomSymbol;
float iterations;
float epsfac;
float solventDielectric;
float soluteDielectric;
int grid;
bool bCalculateCM;
bool bRemoveCM;
bool bRecalculateBornRadii;
unsigned long seed;
// Selects which kernel variant is launched (see SM_VERSION).
SM_VERSION sm_version;
// Per-atom float4 arrays. For psPosq4 the kernels treat x,y,z as position and w as charge;
// for psVelm4 the w component is used alongside kT (presumably inverse mass -- confirm).
CUDAStream<float4>* psPosq4;
CUDAStream<float4>* psPosqP4;
CUDAStream<float4>* psOldPosq4;
CUDAStream<float4>* psVelm4;
CUDAStream<float4>* psForce4;
CUDAStream<float4>* psxVector4;
CUDAStream<float4>* psvVector4;
CUDAStream<float2>* psSigEps2;
// Born/OBC per-atom data (printed by gpuDumpObcInfo / gpuDumpObcLoop1).
CUDAStream<float2>* psObcData;
CUDAStream<float>* psObcChain;
CUDAStream<float>* psBornForce;
CUDAStream<float>* psBornRadii;
CUDAStream<float>* psBornSum;
// Bonded-term index and parameter arrays; the *ID streams hold atom indices
// (and, presumably, output-buffer slots -- confirm against the setters).
CUDAStream<int4>* psBondID;
CUDAStream<float2>* psBondParameter;
CUDAStream<int4>* psBondAngleID1;
CUDAStream<int2>* psBondAngleID2;
CUDAStream<float2>* psBondAngleParameter;
CUDAStream<int4>* psDihedralID1;
CUDAStream<int4>* psDihedralID2;
CUDAStream<float4>* psDihedralParameter;
CUDAStream<int4>* psRbDihedralID1;
CUDAStream<int4>* psRbDihedralID2;
CUDAStream<float4>* psRbDihedralParameter1;
CUDAStream<float2>* psRbDihedralParameter2;
CUDAStream<int4>* psLJ14ID;
CUDAStream<float4>* psLJ14Parameter;
CUDAStream<int>* psNonShakeID;
CUDAStream<int4>* psShakeID;
CUDAStream<float4>* psShakeParameter;
CUDAStream<unsigned int>* psExclusion;
CUDAStream<unsigned int>* psWorkUnit;
CUDAStream<float4>* psRandom4; // Pointer to sets of 4 random numbers for MD integration
CUDAStream<float2>* psRandom2; // Pointer to sets of 2 random numbers for MD integration
CUDAStream<uint4>* psRandomSeed; // Pointer to each random seed
CUDAStream<int>* psRandomPosition; // Pointer to random number positions
CUDAStream<float4>* psLinearMomentum; // Pointer to total linear momentum per CTA
};
// Handle type passed to every function below.
typedef struct _gpuContext *gpuContext;
// Function prototypes
// All functions are declared with C linkage, so none of these names can be
// overloaded. Several take C++ types (std::vector, string) and are therefore
// only callable from C++ despite the linkage.
extern "C"
bool gpuIsAvailable();
// Force-field term setup. Each gpuRead* variant takes a file name; the matching
// gpuSet* variant takes the same kind of data as parallel std::vector arguments
// (one element per term).
extern "C"
int gpuReadBondParameters(gpuContext gpu, char* fname);
extern "C"
void gpuSetBondParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<float>& length, const std::vector<float>& k);
extern "C"
int gpuReadBondAngleParameters(gpuContext gpu, char* fname);
extern "C"
void gpuSetBondAngleParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& atom3,
const std::vector<float>& angle, const std::vector<float>& k);
extern "C"
int gpuReadDihedralParameters(gpuContext gpu, char* fname);
extern "C"
void gpuSetDihedralParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& atom3, const std::vector<int>& atom4,
const std::vector<float>& k, const std::vector<float>& phase, const std::vector<int>& periodicity);
extern "C"
int gpuReadRbDihedralParameters(gpuContext gpu, char* fname);
extern "C"
void gpuSetRbDihedralParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& atom3, const std::vector<int>& atom4,
const std::vector<float>& c0, const std::vector<float>& c1, const std::vector<float>& c2, const std::vector<float>& c3, const std::vector<float>& c4, const std::vector<float>& c5);
extern "C"
int gpuReadLJ14Parameters(gpuContext gpu, char* fname);
extern "C"
void gpuSetLJ14Parameters(gpuContext gpu, float epsfac, float fudge, const std::vector<int>& atom1, const std::vector<int>& atom2,
const std::vector<float>& c6, const std::vector<float>& c12, const std::vector<float>& q1, const std::vector<float>& q2);
// Atom-type lookups keyed by a string (passed by value).
extern "C"
float gpuGetAtomicRadius(gpuContext gpu, string s);
extern "C"
unsigned char gpuGetAtomicSymbol(gpuContext gpu, string s);
extern "C"
int gpuReadAtomicParameters(gpuContext gpu, char* fname);
extern "C"
int gpuReadCoulombParameters(gpuContext gpu, char* fname);
extern "C"
void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const std::vector<int>& atom, const std::vector<float>& c6, const std::vector<float>& c12, const std::vector<float>& q,
const std::vector<char>& symbol, const std::vector<vector<int> >& exclusions);
extern "C"
void gpuSetObcParameters(gpuContext gpu, float innerDielectric, float solventDielectric, const std::vector<int>& atom, const std::vector<float>& radius, const std::vector<float>& scale);
extern "C"
int gpuReadShakeParameters(gpuContext gpu, char* fname);
extern "C"
void gpuSetShakeParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<float>& distance,
const std::vector<float>& invMass1, const std::vector<float>& invMass2, float tolerance);
// Buffer allocation and per-atom state (positions, velocities, masses).
extern "C"
int gpuAllocateInitialBuffers(gpuContext gpu);
extern "C"
void gpuReadCoordinates(gpuContext gpu, char* fname);
extern "C"
void gpuSetPositions(gpuContext gpu, const std::vector<float>& x, const std::vector<float>& y, const std::vector<float>& z);
extern "C"
void gpuSetVelocities(gpuContext gpu, const std::vector<float>& x, const std::vector<float>& y, const std::vector<float>& z);
extern "C"
void gpuSetMass(gpuContext gpu, const std::vector<float>& mass);
extern "C"
void gpuInitializeRandoms(gpuContext gpu);
// Context creation; both return void* (presumably a gpuContext -- confirm at the definitions).
extern "C"
void* gpuInitFromFile(char* fname);
extern "C"
void* gpuInit(int numAtoms);
// Integrator / thermostat configuration.
extern "C"
void gpuSetIntegrationParameters(gpuContext gpu, float tau, float deltaT, float temperature);
extern "C"
void gpuSetVerletIntegrationParameters(gpuContext gpu, float deltaT);
extern "C"
void gpuSetBrownianIntegrationParameters(gpuContext gpu, float tau, float deltaT, float temperature);
extern "C"
void gpuSetAndersenThermostatParameters(gpuContext gpu, float temperature, float collisionProbability);
extern "C"
void gpuShutDown(gpuContext gpu);
extern "C"
int gpuBuildOutputBuffers(gpuContext gpu);
extern "C"
int gpuBuildThreadBlockWorkList(gpuContext gpu);
extern "C"
int gpuBuildExclusionList(gpuContext gpu);
extern "C"
int gpuSetConstants(gpuContext gpu);
// Debug dumps and host-side reference calculations.
extern "C"
void gpuDumpCoordinates(gpuContext gpu);
extern "C"
void gpuDumpPrimeCoordinates(gpuContext gpu);
extern "C"
void gpuDumpForces(gpuContext gpu);
extern "C"
void gpuDumpAtomData(gpuContext gpu);
extern "C"
bool gpuCheckData(gpuContext gpu);
extern "C"
void gpuSetup(void* pVoid);
extern "C"
void kCPUCalculate14(gpuContext gpu);
extern "C"
void kCPUCalculateLocalForces(gpuContext gpu);
// WriteArrayToFileN writes a CUDAStream of N-component floats to "<fname>_<step>.txt".
extern "C"
void WriteArrayToFile1( gpuContext gpu, char* fname, int step, CUDAStream<float>* psPos, int numPrint );
extern "C"
void WriteArrayToFile2( gpuContext gpu, char* fname, int step, CUDAStream<float2>* psPos, int numPrint );
// NOTE(review): the float3 variant is declared here; confirm a definition exists
// alongside the 1/2/4 variants before calling it.
extern "C"
void WriteArrayToFile3( gpuContext gpu, char* fname, int step, CUDAStream<float3>* psPos, int numPrint );
extern "C"
void WriteArrayToFile4( gpuContext gpu, char* fname, int step, CUDAStream<float4>* psPos, int numPrint );
extern "C"
void gpuDumpObcInfo(gpuContext gpu);
extern "C"
void gpuDumpObcLoop1(gpuContext gpu);
#endif //__GPUTYPES_H__
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
//#include <fstream>
using namespace std;
#include "gputypes.h"
// Compile-time switch for the Brownian update kernels in this file.
// When defined, kBrownianUpdatePart1_kernel stores only the per-step
// displacement (force*GDT + random) in pPosqP, and kBrownianUpdatePart2_kernel
// adds the current position back before writing pPosq. When undefined, Part1
// stores the absolute new position and Part2 uses the difference for velocity.
#define DeltaShake
// Constant-memory copy of gpu->sim used by the kernels in this file;
// written by SetBrownianUpdateSim and read back by GetBrownianUpdateSim.
static __constant__ cudaGmxSimulation cSim;
// Upload the host-side simulation block (gpu->sim) into this file's cSim
// constant; any failure is reported through RTERROR.
void SetBrownianUpdateSim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(gpu->sim));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Read this file's cSim constant back into the host-side simulation block
// (gpu->sim); any failure is reported through RTERROR.
void GetBrownianUpdateSim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(gpu->sim));
    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
// First half of the Brownian position update.
// For every atom index below cSim.atoms (each thread advances by
// blockDim.x * gridDim.x per iteration):
//   - saves the current pPosq entry into pOldPosq,
//   - computes force*GDT + random for x, y, z,
//   - writes either that displacement (DeltaShake defined) or position plus
//     displacement (DeltaShake undefined) to pPosqP; the w component is carried over.
// Random numbers are read from pRandom4a at the block's pRandomPosition offset plus the atom index.
__global__ void kBrownianUpdatePart1_kernel()
{
    const unsigned int randomBase = cSim.pRandomPosition[blockIdx.x];
    __syncthreads();

    const unsigned int gridSpan = blockDim.x * gridDim.x;
    for (unsigned int atom = threadIdx.x + blockIdx.x * blockDim.x; atom < cSim.atoms; atom += gridSpan)
    {
        const float4 noise = cSim.pRandom4a[randomBase + atom];
        float4 current     = cSim.pPosq[atom];
        const float4 f     = cSim.pForce4[atom];
        cSim.pOldPosq[atom] = current;
#ifndef DeltaShake
        current.x += f.x*cSim.GDT + noise.x;
        current.y += f.y*cSim.GDT + noise.y;
        current.z += f.z*cSim.GDT + noise.z;
#else
        current.x = f.x*cSim.GDT + noise.x;
        current.y = f.y*cSim.GDT + noise.y;
        current.z = f.z*cSim.GDT + noise.z;
#endif
        cSim.pPosqP[atom] = current;
    }
}
// Host launcher for kBrownianUpdatePart1_kernel, using gpu->sim.blocks blocks of
// gpu->sim.update_threads_per_block threads; launch errors go through LAUNCHERROR.
void kBrownianUpdatePart1(gpuContext gpu)
{
    const dim3 launchGrid(gpu->sim.blocks);
    const dim3 launchBlock(gpu->sim.update_threads_per_block);
    kBrownianUpdatePart1_kernel<<<launchGrid, launchBlock>>>();
    LAUNCHERROR("kBrownianUpdatePart1");
}
// Second half of the Brownian position update.
// For every atom index below cSim.atoms (each thread advances by
// blockDim.x * gridDim.x per iteration) the velocity x/y/z is set from the
// displacement scaled by oneOverDeltaT and the new position is written to pPosq.
// With DeltaShake defined, pPosqP holds a displacement, so the current position
// is added back before the store. Afterwards thread 0 of each block advances
// that block's pRandomPosition entry by paddedNumberOfAtoms, subtracting
// cSim.randoms when the result exceeds it.
__global__ void kBrownianUpdatePart2_kernel()
{
    unsigned int randomCursor = cSim.pRandomPosition[blockIdx.x];
    // Every thread reads the cursor before thread 0 can overwrite it below.
    __syncthreads();

    const unsigned int gridSpan = blockDim.x * gridDim.x;
    for (unsigned int atom = threadIdx.x + blockIdx.x * blockDim.x; atom < cSim.atoms; atom += gridSpan)
    {
        float4 vel           = cSim.pVelm4[atom];
        const float4 current = cSim.pPosq[atom];
        float4 updated       = cSim.pPosqP[atom];
#ifndef DeltaShake
        vel.x = cSim.oneOverDeltaT*(updated.x-current.x);
        vel.y = cSim.oneOverDeltaT*(updated.y-current.y);
        vel.z = cSim.oneOverDeltaT*(updated.z-current.z);
#else
        vel.x = cSim.oneOverDeltaT*(updated.x);
        vel.y = cSim.oneOverDeltaT*(updated.y);
        vel.z = cSim.oneOverDeltaT*(updated.z);
        updated.x += current.x;
        updated.y += current.y;
        updated.z += current.z;
#endif
        cSim.pPosq[atom]  = updated;
        cSim.pVelm4[atom] = vel;
    }

    // Only one thread per block maintains the random-number cursor.
    if (threadIdx.x != 0)
        return;

    randomCursor += cSim.paddedNumberOfAtoms;
    if (randomCursor > cSim.randoms)
        randomCursor -= cSim.randoms;
    cSim.pRandomPosition[blockIdx.x] = randomCursor;
}
extern void kGenerateRandoms(gpuContext gpu);

// Host launcher for kBrownianUpdatePart2_kernel. A function-local static call
// counter (shared by all callers of this function) triggers kGenerateRandoms
// each time it reaches gpu->sim.randomIterations, after which it restarts from zero.
void kBrownianUpdatePart2(gpuContext gpu)
{
    static int callsSinceRefresh = 0;

    const dim3 launchGrid(gpu->sim.blocks);
    const dim3 launchBlock(gpu->sim.update_threads_per_block);
    kBrownianUpdatePart2_kernel<<<launchGrid, launchBlock>>>();
    LAUNCHERROR("kBrownianUpdatePart2");

    // Refill the random-number pool once enough steps have consumed it.
    if (++callsSinceRefresh == gpu->sim.randomIterations)
    {
        kGenerateRandoms(gpu);
        callsSinceRefresh = 0;
    }
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
//#include <fstream>
using namespace std;
#include "gputypes.h"
// Constant-memory copy of gpu->sim used by the Andersen thermostat kernel in this file;
// written by SetCalculateAndersenThermostatSim and read back by GetCalculateAndersenThermostatSim.
static __constant__ cudaGmxSimulation cSim;
// Upload the host-side simulation block (gpu->sim) into this file's cSim
// constant; any failure is reported through RTERROR.
void SetCalculateAndersenThermostatSim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(gpu->sim));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Read this file's cSim constant back into the host-side simulation block
// (gpu->sim); any failure is reported through RTERROR.
void GetCalculateAndersenThermostatSim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(gpu->sim));
    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
// Andersen thermostat step.
// For every atom index below cSim.atoms (each thread advances by
// blockDim.x * gridDim.x per iteration) one float4 of random numbers is read
// from pRandom4a. If its w component is below cSim.collisionProbability the
// atom's velocity x/y/z is replaced by sqrt(kT * velocity.w) times the random
// x/y/z; otherwise the velocity is left unchanged. velocity.w itself is never
// modified (presumably it holds the inverse mass -- confirm).
// Afterwards thread 0 of each block advances that block's pRandomPosition
// entry by paddedNumberOfAtoms, subtracting cSim.randoms when the result exceeds it.
//
// All arithmetic uses float literals and sqrtf so nothing is promoted to
// double precision; results are identical because the selector is exactly 0 or 1.
__global__ void kCalculateAndersenThermostat_kernel()
{
    unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int rpos = cSim.pRandomPosition[blockIdx.x];
    // Every thread reads rpos before thread 0 can overwrite it below.
    __syncthreads();
    while (pos < cSim.atoms)
    {
        float4 velocity = cSim.pVelm4[pos];
        float4 random4a = cSim.pRandom4a[rpos + pos];
        // scale == 0 -> collision (velocity redrawn); scale == 1 -> velocity kept.
        float scale = (random4a.w < cSim.collisionProbability) ? 0.0f : 1.0f;
        float add = (1.0f - scale) * sqrtf(cSim.kT * velocity.w);
        velocity.x = scale*velocity.x + add*random4a.x;
        velocity.y = scale*velocity.y + add*random4a.y;
        velocity.z = scale*velocity.z + add*random4a.z;
        cSim.pVelm4[pos] = velocity;
        pos += blockDim.x * gridDim.x;
    }
    // Update random position pointer
    if (threadIdx.x == 0)
    {
        rpos += cSim.paddedNumberOfAtoms;
        if (rpos > cSim.randoms)
            rpos -= cSim.randoms;
        cSim.pRandomPosition[blockIdx.x] = rpos;
    }
}
extern void kGenerateRandoms(gpuContext gpu);

// Host launcher for kCalculateAndersenThermostat_kernel. A function-local
// static call counter (shared by all callers of this function) triggers
// kGenerateRandoms each time it reaches gpu->sim.randomIterations, after which
// it restarts from zero.
void kCalculateAndersenThermostat(gpuContext gpu)
{
    static int callsSinceRefresh = 0;

    const dim3 launchGrid(gpu->sim.blocks);
    const dim3 launchBlock(gpu->sim.update_threads_per_block);
    kCalculateAndersenThermostat_kernel<<<launchGrid, launchBlock>>>();
    LAUNCHERROR("kCalculateAndersenThermostat");

    // Refill the random-number pool once enough steps have consumed it.
    if (++callsSinceRefresh == gpu->sim.randomIterations)
    {
        kGenerateRandoms(gpu);
        callsSinceRefresh = 0;
    }
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using namespace std;
#include "gputypes.h"
#include "cudatypes.h"
#define UNROLLXX 0
#define UNROLLXY 0
// Per-thread shared-memory record used by kCalculateCDLJForces_kernel.
// Each thread fills its own slot of sA[] and then reads the slots of the other
// threads in its GRID-sized group.
struct Atom {
// Position of the atom staged in this slot.
float x;
float y;
float z;
// Charge (loaded from the w component of pPosq).
float q;
// Lennard-Jones parameters loaded from pAttr (.x -> sig, .y -> eps).
float sig;
float eps;
// Force accumulated on the staged atom by the other threads of the group.
float fx;
float fy;
float fz;
// pAttr values of the thread's own ("i") atom; read back as psA[tgx].eps2 / .sig2
// in the exclusion branches of the kernel.
float eps2;
float sig2;
};
// One Atom slot per thread of the block (indexed by threadIdx.x in the kernel).
__shared__ Atom sA[G8X_NONBOND_THREADS_PER_BLOCK];
// Block-local copy of this block's slice of cSim.pWorkUnit.
__shared__ unsigned int sWorkUnit[G8X_NONBOND_WORKUNITS_PER_SM];
// Successor table: sNext[t] = (t + 1) & (GRID - 1), used to rotate through a tile.
__shared__ unsigned int sNext[GRID];
// Constant-memory copy of gpu->sim used by the kernel in this file;
// written by SetCalculateCDLJForcesSim and read back by GetCalculateCDLJForcesSim.
static __constant__ cudaGmxSimulation cSim;
// Upload the host-side simulation block (gpu->sim) into this file's cSim
// constant; any failure is reported through RTERROR.
void SetCalculateCDLJForcesSim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(gpu->sim));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Read this file's cSim constant back into the host-side simulation block
// (gpu->sim); any failure is reported through RTERROR.
void GetCalculateCDLJForcesSim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(gpu->sim));
    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
// Pairwise nonbonded force kernel (Lennard-Jones plus Coulomb term).
//
// Work distribution:
//   - Each block copies its slice of cSim.pWorkUnit into sWorkUnit, then each
//     group of GRID consecutive threads (threadIdx.x >> GRIDBITS) processes one
//     work unit at a time, stepping backwards by cSim.nonbond_workBlock.
//   - A work unit packs: bit 0 = exclusion flag, bits 2..16 = y tile index,
//     bits 17 and up = x tile index. Tile indices are shifted left by GRIDBITS
//     to obtain the first atom index of the tile.
//
// Per pair the kernel evaluates
//     dEdR = (eps * (12*sig6 - 6) * sig6 + epsfac*q_i*q_j*invR) * invR^2
// with sig = sig_i + sig_j, eps = eps_i * eps_j, sig6 = (sig*invR)^6, and
// accumulates -dEdR * (r_j - r_i) on atom i (and +dEdR * (r_j - r_i) on atom j
// for off-diagonal tiles).
//
// Output: results are stored (not accumulated) into cSim.pForce4a at
// atom + tile * cSim.stride, i.e. one slot per (atom, partner tile); presumably
// a later pass sums these slots -- confirm.
//
// NOTE(review): threads exchange data through sA[] with no barrier between one
// thread's write and another thread's read inside a GRID-sized group. This
// appears to rely on the group executing in lockstep (GRID presumably equals the
// warp size) -- confirm before building for architectures without that guarantee.
__global__ void kCalculateCDLJForces_kernel()
{
// Read queue of work blocks once so the remainder of
// kernel can run asynchronously
int pos = cSim.nbWorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.nbWorkUnitsPerBlockRemainder);
int end = cSim.nbWorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.nbWorkUnitsPerBlockRemainder);
if (threadIdx.x < end - pos)
{
sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
}
// Build the cyclic successor table used to rotate tj through a tile.
if (threadIdx.x < GRID)
{
sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
}
__syncthreads();
// Now change pos and end to reflect work queue just read
// into shared memory
end = end - pos;
pos = end - (threadIdx.x >> GRIDBITS) - 1;
while (pos >= 0)
{
// Extract cell coordinates from appropriate work unit
unsigned int x = sWorkUnit[pos];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
bool bExclusionFlag = (x & 0x1);
x = (x >> 17) << GRIDBITS;
float4 apos; // Local atom x, y, z, q
float3 af; // Local atom fx, fy, fz
float dx;
float dy;
float dz;
float r2;
float invR;
float sig;
float sig2;
float sig6;
float eps;
float dEdR;
// tgx: lane within the GRID-sized group; tbx: first thread of the group.
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int tbx = threadIdx.x - tgx;
int tj = tgx;
// psA points at this group's GRID slots of sA.
Atom* psA = &sA[tbx];
if (!bExclusionFlag)
{
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// NOTE(review): with no exclusion mask, the j == tgx iteration below has
// r2 == 0 (invR is infinite). Presumably diagonal tiles always carry the
// exclusion flag so this branch is not reached in practice -- confirm.
// Read fixed atom data into registers and GRF
unsigned int i = x + tgx;
apos = cSim.pPosq[i];
float2 a = cSim.pAttr[i];
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
sA[threadIdx.x].q = apos.w;
sA[threadIdx.x].sig = a.x;
sA[threadIdx.x].eps = a.y;
af.x = 0.0f;
af.y = 0.0f;
af.z = 0.0f;
apos.w *= cSim.epsfac;
for (unsigned int j = 0; j < GRID; j++)
{
dx = psA[j].x - apos.x;
dy = psA[j].y - apos.y;
dz = psA[j].z - apos.z;
r2 = dx * dx + dy * dy + dz * dz;
invR = 1.0f / sqrt(r2);
sig = a.x + psA[j].sig;
sig2 = invR * sig;
sig2 *= sig2;
sig6 = sig2 * sig2 * sig2;
eps = a.y * psA[j].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[j].q * invR;
dEdR *= invR * invR;
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
}
// Write results
float4 of;
of.x = af.x;
of.y = af.y;
of.z = af.z;
of.w = 0.0f;
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of;
}
else // 100% utilization
{
// Read fixed atom data into registers and GRF
int j = y + tgx;
unsigned int i = x + tgx;
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pAttr[j];
apos = cSim.pPosq[i];
float2 a = cSim.pAttr[i];
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].q = temp.w;
sA[threadIdx.x].sig = temp1.x;
sA[threadIdx.x].eps = temp1.y;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
sA[threadIdx.x].fz = af.z = 0.0f;
sA[threadIdx.x].sig2 = a.x;
sA[threadIdx.x].eps2 = a.y;
apos.w *= cSim.epsfac;
// tj starts at this thread's own lane and advances through sNext, so at each
// step every thread of the group addresses a different psA slot.
for (j = 0; j < GRID; j++)
{
dx = psA[tj].x - apos.x;
dy = psA[tj].y - apos.y;
dz = psA[tj].z - apos.z;
r2 = dx * dx + dy * dy + dz * dz;
invR = 1.0f / sqrt(r2);
sig = a.x + psA[tj].sig;
sig2 = invR * sig;
sig2 *= sig2;
sig6 = sig2 * sig2 * sig2;
eps = a.y * psA[tj].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[tj].q * invR;
dEdR *= invR * invR;
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
tj = sNext[tj];
}
// Write results
float4 of;
of.x = af.x;
of.y = af.y;
of.z = af.z;
of.w = 0.0f;
int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of;
// Second store: force accumulated on the y-tile atom staged in this thread's slot
// (of.w keeps the 0.0f assigned above).
of.x = sA[threadIdx.x].fx;
of.y = sA[threadIdx.x].fy;
of.z = sA[threadIdx.x].fz;
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of;
}
}
else // bExclusion
{
// Read exclusion data
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
// excl: one bit per partner atom j, consumed from bit 0 upwards; a clear bit
// zeroes the pair's contribution.
unsigned int excl = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
unsigned int i = x + tgx;
apos = cSim.pPosq[i];
float2 a = cSim.pAttr[i];
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
sA[threadIdx.x].q = apos.w;
sA[threadIdx.x].sig = a.x;
sA[threadIdx.x].eps = a.y;
af.x = 0.0f;
af.y = 0.0f;
af.z = 0.0f;
sA[threadIdx.x].sig2 = a.x;
sA[threadIdx.x].eps2 = a.y;
apos.w *= cSim.epsfac;
for (unsigned int j = 0; j < GRID; j++)
{
dx = psA[j].x - apos.x;
dy = psA[j].y - apos.y;
dz = psA[j].z - apos.z;
r2 = dx * dx + dy * dy + dz * dz;
invR = 1.0f / sqrt(r2);
sig = psA[tgx].sig2 + psA[j].sig;
sig2 = invR * sig;
sig2 *= sig2;
sig6 = sig2 * sig2 * sig2;
eps = psA[tgx].eps2 * psA[j].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[j].q * invR;
dEdR *= invR * invR;
// Overwrite (rather than multiply) so a non-finite dEdR from r2 == 0 is discarded.
if (!(excl & 0x1))
{
dEdR = 0.0f;
}
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
excl >>= 1;
}
// Write results
float4 of;
of.x = af.x;
of.y = af.y;
of.z = af.z;
of.w = 0.0f;
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of;
}
else // 100% utilization
{
// Read fixed atom data into registers and GRF
unsigned int excl = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
// Rotate the mask so bit 0 lines up with this thread's starting lane (tj == tgx).
// NOTE(review): for tgx == 0 the left shift count is GRID; if GRID is 32 that is a
// full-width shift, which C++ leaves undefined -- confirm the target behaves as intended.
excl = (excl >> tgx) | (excl << (GRID - tgx));
int j = y + tgx;
unsigned int i = x + tgx;
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pAttr[j];
apos = cSim.pPosq[i];
float2 a = cSim.pAttr[i];
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].q = temp.w;
sA[threadIdx.x].sig = temp1.x;
sA[threadIdx.x].eps = temp1.y;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
sA[threadIdx.x].fz = af.z = 0.0f;
sA[threadIdx.x].sig2 = a.x;
sA[threadIdx.x].eps2 = a.y;
apos.w *= cSim.epsfac;
for (j = 0; j < GRID; j++)
{
dx = psA[tj].x - apos.x;
dy = psA[tj].y - apos.y;
dz = psA[tj].z - apos.z;
r2 = dx * dx + dy * dy + dz * dz;
invR = 1.0f / sqrt(r2);
sig = psA[tgx].sig2 + psA[tj].sig;
sig2 = invR * sig;
sig2 *= sig2;
sig6 = sig2 * sig2 * sig2;
eps = psA[tgx].eps2 * psA[tj].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[tj].q * invR;
dEdR *= invR * invR;
if (!(excl & 0x1))
{
dEdR = 0.0f;
}
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
excl >>= 1;
tj = sNext[tj];
}
// Write results
float4 of;
of.x = af.x;
of.y = af.y;
of.z = af.z;
of.w = 0.0f;
int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of;
of.x = sA[threadIdx.x].fx;
of.y = sA[threadIdx.x].fy;
of.z = sA[threadIdx.x].fz;
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of;
}
}
// Move this thread group on to its next work unit.
pos -= cSim.nonbond_workBlock;
}
}
// Alternative kernel used when gpu->sm_version is at least SM_12.
// Only declared here; its definition is not in this translation unit.
__global__ extern void kCalculateCDLJForces_12_kernel();

// Host-side entry point for the CDLJ force calculation.
// Picks one of the two kernel variants from gpu->sm_version, launches it
// with the block/thread configuration stored in gpu->sim, and then runs
// the LAUNCHERROR check on the launch.
void kCalculateCDLJForces(gpuContext gpu)
{
    if (gpu->sm_version >= SM_12)
    {
        kCalculateCDLJForces_12_kernel<<<gpu->sim.nonbond_blocks,
                                         gpu->sim.nonbond_threads_per_block>>>();
    }
    else
    {
        kCalculateCDLJForces_kernel<<<gpu->sim.nonbond_blocks,
                                      gpu->sim.nonbond_threads_per_block>>>();
    }
    LAUNCHERROR("kCalculateCDLJForces");
}
\ No newline at end of file
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using namespace std;
#include "gputypes.h"
#include "cudatypes.h"
#define UNROLLXX 0
#define UNROLLXY 0
// Shared-memory record holding one atom of the tile a thread group is
// working on. Field order (and therefore memory layout) matters to the
// kernel's shared-memory footprint and must not be changed.
struct Atom {
    float x, y, z;      // position, copied from pPosq .x/.y/.z
    float q;            // charge, copied from pPosq .w
    float sig, eps;     // per-atom attributes, copied from pAttr .x/.y
    float fx, fy, fz;   // force accumulated on this atom by the other tile
};
// Per-block scratch: the kernel indexes this by threadIdx.x, one Atom slot per thread.
__shared__ Atom sA[GT2XX_NONBOND_THREADS_PER_BLOCK];
// Work units of this block, copied from cSim.pWorkUnit at kernel start.
__shared__ unsigned int sWorkUnit[GT2XX_NONBOND_WORKUNITS_PER_SM];
// Successor table: the kernel fills it with sNext[k] = (k + 1) & (GRID - 1).
__shared__ unsigned int sNext[GRID];
// File-local constant-memory copy of gpu->sim; written by SetCalculateCDLJForces_12Sim.
static __constant__ cudaGmxSimulation cSim;
// Uploads the host-side simulation description (gpu->sim) into this file's
// constant-memory symbol cSim. RTERROR handles a failed copy.
void SetCalculateCDLJForces_12Sim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Downloads this file's constant-memory symbol cSim back into the host-side
// simulation description (gpu->sim). RTERROR handles a failed copy.
void GetCalculateCDLJForces_12Sim(gpuContext gpu)
{
    cudaError_t status;
    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    // Message previously said "SetSim", which misattributed the failure to
    // the upload path.
    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
}
// Accumulates Coulomb plus 12-6 Lennard-Jones style ("CDLJ") pair forces for
// the work units assigned to this thread block and stores them in
// cSim.pForce4a. The kernel takes no arguments; every input comes from the
// file-local constant cSim.
//
// Each run of GRID consecutive threads (same threadIdx.x >> GRIDBITS) handles
// one work unit per pass of the while loop. A work unit names two tiles of
// GRID atoms (first atom indices x and y) and carries a flag saying whether
// an exclusion mask has to be applied.
//
// NOTE(review): inside the loops a thread reads and updates shared-memory
// slots written by other threads of its group, with no barrier in between.
// Presumably this relies on the GRID threads of a group executing in
// lockstep (GRID == warp size) -- confirm before porting to newer hardware.
__global__ void kCalculateCDLJForces_12_kernel()
{
// Read queue of work blocks once so the remainder of
// kernel can run asynchronously
// Block b owns nbWorkUnitsPerBlock units, and the first
// nbWorkUnitsPerBlockRemainder blocks own one extra unit each.
int pos = cSim.nbWorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.nbWorkUnitsPerBlockRemainder);
int end = cSim.nbWorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.nbWorkUnitsPerBlockRemainder);
if (threadIdx.x < end - pos)
{
sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
}
// Build the successor table used to walk a tile in rotated order.
if (threadIdx.x < GRID)
{
sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
}
__syncthreads();
// Now change pos and end to reflect work queue just read
// into shared memory
// end becomes the number of units; each thread group starts at a different
// unit counted from the back and steps by cSim.nonbond_workBlock.
end = end - pos;
pos = end - (threadIdx.x >> GRIDBITS) - 1;
while (pos >= 0)
{
// Extract cell coordinates from appropriate work unit
// Layout as decoded below: bit 0 = exclusion flag, bits 2..16 = y tile
// index, bits 17 and up = x tile index. Tile index << GRIDBITS gives the
// index of the tile's first atom.
unsigned int x = sWorkUnit[pos];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
bool bExclusionFlag = (x & 0x1);
x = (x >> 17) << GRIDBITS;
float4 apos; // Local atom x, y, z, q
float3 af; // Local atom fx, fy, fz
float dx;
float dy;
float dz;
float r2;
float invR;
float sig;
float sig2;
float sig6;
float eps;
float dEdR;
// tgx: position of this thread inside its group; tbx: first thread of the group.
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int tbx = threadIdx.x - tgx;
int tj = tgx;
// psA[k] is the shared slot of the k-th thread of this group.
Atom* psA = &sA[tbx];
if (!bExclusionFlag)
{
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
unsigned int i = x + tgx;
apos = cSim.pPosq[i];
float2 a = cSim.pAttr[i];
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
sA[threadIdx.x].q = apos.w;
sA[threadIdx.x].sig = a.x;
sA[threadIdx.x].eps = a.y;
af.x = 0.0f;
af.y = 0.0f;
af.z = 0.0f;
// Fold the Coulomb prefactor into the local charge only; shared q stays raw.
apos.w *= cSim.epsfac;
// Every thread visits all GRID atoms of the tile and keeps only the force
// on its own atom, so each pair is evaluated twice.
// NOTE(review): for j == tgx the atom is paired with itself (r2 == 0), and
// nothing on this path masks that term. Presumably diagonal tiles always
// carry the exclusion flag so this branch is not reached -- confirm.
for (unsigned int j = 0; j < GRID; j++)
{
dx = psA[j].x - apos.x;
dy = psA[j].y - apos.y;
dz = psA[j].z - apos.z;
r2 = dx * dx + dy * dy + dz * dz;
invR = 1.0f / sqrt(r2);
// sig6 = ((sig_i + sig_j) / r)^6, eps = eps_i * eps_j
sig = a.x + psA[j].sig;
sig2 = invR * sig;
sig2 *= sig2;
sig6 = sig2 * sig2 * sig2;
eps = a.y * psA[j].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
// Coulomb term: epsfac * q_i * q_j / r
dEdR += apos.w * psA[j].q * invR;
dEdR *= invR * invR;
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
}
// Write results
float4 of;
of.x = af.x;
of.y = af.y;
of.z = af.z;
of.w = 0.0f;
// Slot for atom x + tgx in the output slice selected by tile index x >> GRIDBITS.
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of;
}
else // 100% utilization
{
// Read fixed atom data into registers and GRF
// Atom j = y + tgx goes to shared memory with zeroed force accumulators;
// atom i = x + tgx stays in registers.
int j = y + tgx;
unsigned int i = x + tgx;
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pAttr[j];
apos = cSim.pPosq[i];
float2 a = cSim.pAttr[i];
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].q = temp.w;
sA[threadIdx.x].sig = temp1.x;
sA[threadIdx.x].eps = temp1.y;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
sA[threadIdx.x].fz = af.z = 0.0f;
apos.w *= cSim.epsfac;
// tj starts at tgx and advances through sNext, so in any iteration the
// threads of a group address distinct shared slots. Each pair is evaluated
// once: -f goes to the register atom, +f to the shared atom.
for (j = 0; j < GRID; j++)
{
dx = psA[tj].x - apos.x;
dy = psA[tj].y - apos.y;
dz = psA[tj].z - apos.z;
r2 = dx * dx + dy * dy + dz * dz;
invR = 1.0f / sqrt(r2);
sig = a.x + psA[tj].sig;
sig2 = invR * sig;
sig2 *= sig2;
sig6 = sig2 * sig2 * sig2;
eps = a.y * psA[tj].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[tj].q * invR;
dEdR *= invR * invR;
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
tj = sNext[tj];
}
// Write results
float4 of;
of.x = af.x;
of.y = af.y;
of.z = af.z;
of.w = 0.0f;
// Force on atom x + tgx goes to the slice selected by the y tile index ...
int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of;
// ... and force on atom y + tgx to the slice selected by the x tile index.
of.x = sA[threadIdx.x].fx;
of.y = sA[threadIdx.x].fy;
of.z = sA[threadIdx.x].fz;
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of;
}
}
else // bExclusion
{
// Read exclusion data
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
// One mask word per thread: bit j (LSB first) is tested for partner j below.
unsigned int excl = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
unsigned int i = x + tgx;
apos = cSim.pPosq[i];
float2 a = cSim.pAttr[i];
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
sA[threadIdx.x].q = apos.w;
sA[threadIdx.x].sig = a.x;
sA[threadIdx.x].eps = a.y;
af.x = 0.0f;
af.y = 0.0f;
af.z = 0.0f;
apos.w *= cSim.epsfac;
for (unsigned int j = 0; j < GRID; j++)
{
dx = psA[j].x - apos.x;
dy = psA[j].y - apos.y;
dz = psA[j].z - apos.z;
r2 = dx * dx + dy * dy + dz * dz;
invR = 1.0f / sqrt(r2);
sig = a.x + psA[j].sig;
sig2 = invR * sig;
sig2 *= sig2;
sig6 = sig2 * sig2 * sig2;
eps = a.y * psA[j].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[j].q * invR;
dEdR *= invR * invR;
// A cleared mask bit drops this pair; the value computed above is discarded.
if (!(excl & 0x1))
{
dEdR = 0.0f;
}
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
excl >>= 1;
}
// Write results
float4 of;
of.x = af.x;
of.y = af.y;
of.z = af.z;
of.w = 0.0f;
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of;
}
else // 100% utilization
{
// Read fixed atom data into registers and GRF
unsigned int excl = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
// Rotate the mask right by tgx so that bit 0 lines up with the first
// partner visited (tj == tgx).
// NOTE(review): for tgx == 0 the left shift count is GRID; if GRID equals
// the bit width of unsigned int that shift is undefined in C++ -- confirm.
excl = (excl >> tgx) | (excl << (GRID - tgx));
int j = y + tgx;
unsigned int i = x + tgx;
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pAttr[j];
apos = cSim.pPosq[i];
float2 a = cSim.pAttr[i];
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].q = temp.w;
sA[threadIdx.x].sig = temp1.x;
sA[threadIdx.x].eps = temp1.y;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
sA[threadIdx.x].fz = af.z = 0.0f;
apos.w *= cSim.epsfac;
for (j = 0; j < GRID; j++)
{
dx = psA[tj].x - apos.x;
dy = psA[tj].y - apos.y;
dz = psA[tj].z - apos.z;
r2 = dx * dx + dy * dy + dz * dz;
invR = 1.0f / sqrt(r2);
sig = a.x + psA[tj].sig;
sig2 = invR * sig;
sig2 *= sig2;
sig6 = sig2 * sig2 * sig2;
eps = a.y * psA[tj].eps;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[tj].q * invR;
dEdR *= invR * invR;
if (!(excl & 0x1))
{
dEdR = 0.0f;
}
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
excl >>= 1;
tj = sNext[tj];
}
// Write results
float4 of;
of.x = af.x;
of.y = af.y;
of.z = af.z;
of.w = 0.0f;
int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of;
of.x = sA[threadIdx.x].fx;
of.y = sA[threadIdx.x].fy;
of.z = sA[threadIdx.x].fz;
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = of;
}
}
// Move on to this group's next work unit.
pos -= cSim.nonbond_workBlock;
}
}
// Host-side launcher for kCalculateCDLJForces_12_kernel. The grid and block
// sizes are taken from gpu->sim; LAUNCHERROR checks the launch afterwards.
void kCalculateCDLJForces_12(gpuContext gpu)
{
    kCalculateCDLJForces_12_kernel<<<gpu->sim.nonbond_blocks,
                                     gpu->sim.nonbond_threads_per_block>>>();
    LAUNCHERROR("kCalculateCDLJForces_12");
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using namespace std;
#include "gputypes.h"
#include "cudatypes.h"
#include "cudaKernels.h"
// Shared-memory record holding one atom of the tile a thread group is
// working on. Field order (and therefore memory layout) matters to the
// kernel's shared-memory footprint and must not be changed.
struct Atom {
    float x, y, z;      // position, copied from pPosq .x/.y/.z
    float q;            // charge from pPosq .w scaled by cSim.epsfac
    float sig, eps;     // per-atom attributes, copied from pAttr .x/.y
    float br;           // Born radius, copied from pBornRadii
    float fx, fy, fz;   // force accumulated on this atom by the other tile
    float fb;           // Born-force sum accumulated on this atom
    float q2;           // charge from pPosq .w scaled by cSim.preFactor
    float junk;         // never referenced by the kernel; presumably padding
};
// Per-block scratch: the kernel indexes this by threadIdx.x, one Atom slot per thread.
__shared__ Atom sA[G8X_NONBOND_THREADS_PER_BLOCK];
// Work units of this block, copied from cSim.pWorkUnit at kernel start.
__shared__ unsigned int sWorkUnit[G8X_NONBOND_WORKUNITS_PER_SM];
// Successor table: the kernel fills it with sNext[k] = (k + 1) & (GRID - 1).
__shared__ unsigned int sNext[GRID];
// File-local constant-memory copy of gpu->sim; written by SetCalculateCDLJObcGbsaForces1Sim.
static __constant__ cudaGmxSimulation cSim;
// Uploads the host-side simulation description (gpu->sim) into this file's
// constant-memory symbol cSim. RTERROR handles a failed copy.
void SetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Downloads this file's constant-memory symbol cSim back into the host-side
// simulation description (gpu->sim). RTERROR handles a failed copy.
void GetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu)
{
    cudaError_t status;
    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    // Message previously said "SetSim", which misattributed the failure to
    // the upload path.
    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
}
// Accumulates, in one pass over the tile pairs, the Coulomb plus 12-6
// Lennard-Jones style ("CDLJ") pair forces and the first OBC/GBSA force loop.
// Cartesian forces go to cSim.pForce4a, Born-force sums to cSim.pBornForce.
// The kernel takes no arguments; every input comes from the file-local
// constant cSim.
//
// Each run of GRID consecutive threads (same threadIdx.x >> GRIDBITS) handles
// one work unit per pass of the while loop. A work unit names two tiles of
// GRID atoms (first atom indices x and y) and carries a flag saying whether
// an exclusion mask has to be applied.
//
// NOTE(review): inside the loops a thread reads and updates shared-memory
// slots written by other threads of its group, with no barrier in between.
// Presumably this relies on the GRID threads of a group executing in
// lockstep (GRID == warp size) -- confirm before porting to newer hardware.
__global__ void kCalculateCDLJObcGbsaForces1_kernel()
{
// Read queue of work blocks once so the remainder of
// kernel can run asynchronously
// Block b owns nbWorkUnitsPerBlock units, and the first
// nbWorkUnitsPerBlockRemainder blocks own one extra unit each.
int pos = cSim.nbWorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.nbWorkUnitsPerBlockRemainder);
int end = cSim.nbWorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.nbWorkUnitsPerBlockRemainder);
if (threadIdx.x < end - pos)
{
sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
}
// Build the successor table used to walk a tile in rotated order.
if (threadIdx.x < GRID)
{
sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
}
__syncthreads();
// Now change pos and end to reflect work queue just read
// into shared memory
// end becomes the number of units; each thread group starts at a different
// unit counted from the back and steps by cSim.nonbond_workBlock.
end = end - pos;
pos = end - (threadIdx.x >> GRIDBITS) - 1;
while (pos >= 0)
{
// Extract cell coordinates from appropriate work unit
// Layout as decoded below: bit 0 = exclusion flag, bits 2..16 = y tile
// index, bits 17 and up = x tile index. Tile index << GRIDBITS gives the
// index of the tile's first atom.
unsigned int x = sWorkUnit[pos];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
bool bExclusionFlag = (x & 0x1);
x = (x >> 17) << GRIDBITS;
// tgx: position of this thread inside its group.
unsigned int tgx = threadIdx.x & (GRID - 1);
// The x-tile atom i is kept in registers on every path.
unsigned int i = x + tgx;
float4 apos = cSim.pPosq[i];
float2 a = cSim.pAttr[i];
float br = cSim.pBornRadii[i];
unsigned int tbx = threadIdx.x - tgx;
int tj = tgx;
// psA[k] is the shared slot of the k-th thread of this group.
Atom* psA = &sA[tbx];
if (!bExclusionFlag)
{
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
// Shared copy carries pre-scaled charges: q = epsfac * charge for the
// Coulomb term, q2 = preFactor * charge for the GB term. apos.w stays raw.
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
sA[threadIdx.x].q = cSim.epsfac * apos.w;
sA[threadIdx.x].q2 = cSim.preFactor * apos.w;
sA[threadIdx.x].sig = a.x;
sA[threadIdx.x].eps = a.y;
sA[threadIdx.x].br = br;
// af.x/y/z: Cartesian force on atom i; af.w: Born-force sum for atom i.
float4 af;
af.x = 0.0f;
af.y = 0.0f;
af.z = 0.0f;
af.w = 0.0f;
// Every thread visits all GRID atoms of the tile and keeps only the
// contribution to its own atom, so each pair is evaluated twice.
// NOTE(review): for j == tgx the atom is paired with itself (r2 == 0), and
// nothing on this path masks the CDLJ term. Presumably diagonal tiles always
// carry the exclusion flag so this branch is not reached -- confirm.
for (unsigned int j = 0; j < GRID; j++)
{
float dx = psA[j].x - apos.x;
float dy = psA[j].y - apos.y;
float dz = psA[j].z - apos.z;
float r2 = dx * dx + dy * dy + dz * dz;
// CDLJ part
// sig6 = ((sig_i + sig_j) / r)^6, eps = eps_i * eps_j
float invR = 1.0f / sqrt(r2);
float sig = a.x + psA[j].sig;
float sig2 = invR * sig;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float eps = a.y * psA[j].eps;
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[j].q * invR;
dEdR *= invR * invR;
//float dEdR = 0.0f;
// ObcGbsaForce1 part
// alpha2_ij = br_i * br_j, D_ij = r^2 / (4 alpha2_ij),
// Gpol = q_i * q2_j / (r^2 + alpha2_ij * exp(-D_ij))^(3/2)
float alpha2_ij = br * psA[j].br;
float D_ij = r2 / (4.0f * alpha2_ij);
float expTerm = exp(-D_ij);
float denominator2 = r2 + alpha2_ij * expTerm;
float denominator = sqrt(denominator2);
float Gpol = (apos.w * psA[j].q2) / (denominator * denominator2);
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
af.w += dGpol_dalpha2_ij * psA[j].br;
dEdR += Gpol * (1.0f - 0.25f * expTerm);
// Add Forces
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
}
// Write results
// Slot for atom x + tgx in the output slice selected by tile index x >> GRIDBITS.
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = af;
cSim.pBornForce[offset] = af.w;
}
else // 100% utilization
{
// Read fixed atom data into registers and GRF
// Atom j = y + tgx goes to shared memory with zeroed accumulators.
int j = y + tgx;
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pAttr[j];
sA[threadIdx.x].br = cSim.pBornRadii[j];
float4 af;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
sA[threadIdx.x].fz = af.z = 0.0f;
sA[threadIdx.x].fb = af.w = 0.0f;
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].q = cSim.epsfac * temp.w;
sA[threadIdx.x].q2 = cSim.preFactor * temp.w;
sA[threadIdx.x].sig = temp1.x;
sA[threadIdx.x].eps = temp1.y;
// tj starts at tgx and advances through sNext, so in any iteration the
// threads of a group address distinct shared slots. Each pair is evaluated
// once and both partners are updated.
for (j = 0; j < GRID; j++)
{
float dx = psA[tj].x - apos.x;
float dy = psA[tj].y - apos.y;
float dz = psA[tj].z - apos.z;
float r2 = dx * dx + dy * dy + dz * dz;
// CDLJ part
float invR = 1.0f / sqrt(r2);
float sig = a.x + psA[tj].sig;
float sig2 = invR * sig;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float eps = a.y * psA[tj].eps;
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[tj].q * invR;
dEdR *= invR * invR;
//float dEdR = 0.0f;
// ObcGbsaForce1 part
float alpha2_ij = br * psA[tj].br;
float D_ij = r2 / (4.0f * alpha2_ij);
float expTerm = exp(-D_ij);
float denominator2 = r2 + alpha2_ij * expTerm;
float denominator = sqrt(denominator2);
float Gpol = (apos.w * psA[tj].q2) / (denominator * denominator2);
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
// Born-force sum: each partner is weighted by the other's Born radius.
af.w += dGpol_dalpha2_ij * psA[tj].br;
psA[tj].fb += dGpol_dalpha2_ij * br;
dEdR += Gpol * (1.0f - 0.25f * expTerm);
// Add forces
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
tj = sNext[tj];
}
// Write results
// Results for atom x + tgx go to the slice selected by the y tile index ...
int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = af;
cSim.pBornForce[offset] = af.w;
// ... and results for atom y + tgx to the slice selected by the x tile
// index. af.w is not overwritten here, so the .w lane of this second
// pForce4a store still holds atom i's Born sum; atom j's sum (fb) is
// written to pBornForce only.
af.x = sA[threadIdx.x].fx;
af.y = sA[threadIdx.x].fy;
af.z = sA[threadIdx.x].fz;
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = af;
cSim.pBornForce[offset] = sA[threadIdx.x].fb;
}
}
else // bExclusion
{
// Read exclusion data
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
// One mask word per thread: bit j (LSB first) is tested for partner j below.
unsigned int excl = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
float4 af;
af.x = 0.0f;
af.y = 0.0f;
af.z = 0.0f;
af.w = 0.0f;
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
sA[threadIdx.x].q = cSim.epsfac * apos.w;
sA[threadIdx.x].q2 = cSim.preFactor * apos.w;
sA[threadIdx.x].sig = a.x;
sA[threadIdx.x].eps = a.y;
sA[threadIdx.x].br = br;
for (unsigned int j = 0; j < GRID; j++)
{
float dx = psA[j].x - apos.x;
float dy = psA[j].y - apos.y;
float dz = psA[j].z - apos.z;
float r2 = dx * dx + dy * dy + dz * dz;
// CDLJ part
float invR = 1.0f / sqrt(r2);
float sig = a.x + psA[j].sig;
float sig2 = invR * sig;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float eps = a.y * psA[j].eps;
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[j].q * invR;
dEdR *= invR * invR;
// A cleared mask bit drops only the CDLJ term; the GB term below is
// still added for this pair.
if (!(excl & 0x1))
{
dEdR = 0.0f;
}
//float dEdR = 0.0f;
// ObcGbsaForce1 part
float alpha2_ij = br * psA[j].br;
float D_ij = r2 / (4.0f * alpha2_ij);
float expTerm = exp(-D_ij);
float denominator2 = r2 + alpha2_ij * expTerm;
float denominator = sqrt(denominator2);
float Gpol = (apos.w * psA[j].q2) / (denominator * denominator2);
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
af.w += dGpol_dalpha2_ij * psA[j].br;
dEdR += Gpol * (1.0f - 0.25f * expTerm);
// Add Forces
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
excl >>= 1;
}
// Write results
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = af;
cSim.pBornForce[offset] = af.w;
}
else // 100% utilization
{
// Read fixed atom data into registers and GRF
unsigned int excl = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
float4 af;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
sA[threadIdx.x].fz = af.z = 0.0f;
sA[threadIdx.x].fb = af.w = 0.0f;
int j = y + tgx;
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pAttr[j];
sA[threadIdx.x].br = cSim.pBornRadii[j];
// Rotate the mask right by tgx so that bit 0 lines up with the first
// partner visited (tj == tgx).
// NOTE(review): for tgx == 0 the left shift count is GRID; if GRID equals
// the bit width of unsigned int that shift is undefined in C++ -- confirm.
excl = (excl >> tgx) | (excl << (GRID - tgx));
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].q = cSim.epsfac * temp.w;
sA[threadIdx.x].q2 = cSim.preFactor * temp.w;
sA[threadIdx.x].sig = temp1.x;
sA[threadIdx.x].eps = temp1.y;
for (j = 0; j < GRID; j++)
{
float dx = psA[tj].x - apos.x;
float dy = psA[tj].y - apos.y;
float dz = psA[tj].z - apos.z;
float r2 = dx * dx + dy * dy + dz * dz;
// CDLJ part
float invR = 1.0f / sqrt(r2);
float sig = a.x + psA[tj].sig;
float sig2 = invR * sig;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float eps = a.y * psA[tj].eps;
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[tj].q * invR;
dEdR *= invR * invR;
if (!(excl & 0x1))
{
dEdR = 0.0f;
}
//float dEdR = 0.0f;
// ObcGbsaForce1 part
float alpha2_ij = br * psA[tj].br;
float D_ij = r2 / (4.0f * alpha2_ij);
float expTerm = exp(-D_ij);
float denominator2 = r2 + alpha2_ij * expTerm;
float denominator = sqrt(denominator2);
float Gpol = (apos.w * psA[tj].q2) / (denominator * denominator2);
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
af.w += dGpol_dalpha2_ij * psA[tj].br;
psA[tj].fb += dGpol_dalpha2_ij * br;
dEdR += Gpol * (1.0f - 0.25f * expTerm);
// Add forces
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
excl >>= 1;
tj = sNext[tj];
}
// Write results
int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = af;
cSim.pBornForce[offset] = af.w;
// Second store: results for atom y + tgx (af.w is left unchanged, see the
// matching store in the non-exclusion branch).
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
af.x = sA[threadIdx.x].fx;
af.y = sA[threadIdx.x].fy;
af.z = sA[threadIdx.x].fz;
cSim.pForce4a[offset] = af;
cSim.pBornForce[offset] = sA[threadIdx.x].fb;
}
}
// Move on to this group's next work unit.
pos -= cSim.nonbond_workBlock;
}
}
// Alternative kernel used when gpu->sm_version is at least SM_12.
// Only declared here; its definition is not in this translation unit.
__global__ extern void kCalculateCDLJObcGbsaForces1_12_kernel();

// Host-side entry point for the combined CDLJ + OBC/GBSA first force loop.
// If gpu->bRecalculateBornRadii is set, the Born sums are computed and
// reduced first. Then one of the two kernel variants is launched according
// to gpu->sm_version, and LAUNCHERROR checks the launch at the end.
void kCalculateCDLJObcGbsaForces1(gpuContext gpu)
{
    // Refresh the Born radii when the context asks for it.
    if (gpu->bRecalculateBornRadii)
    {
        kCalculateObcGbsaBornSum(gpu);
        kReduceObcGbsaBornSum(gpu);
    }

    if (gpu->sm_version >= SM_12)
    {
        kCalculateCDLJObcGbsaForces1_12_kernel<<<gpu->sim.nonbond_blocks,
                                                 gpu->sim.nonbond_threads_per_block>>>();
    }
    else
    {
        kCalculateCDLJObcGbsaForces1_kernel<<<gpu->sim.nonbond_blocks,
                                              gpu->sim.nonbond_threads_per_block>>>();
    }

    // Debug dump, kept in the source but permanently disabled.
    if (0)
    {
        static int step = 0;
        ++step;
        kReduceBornSumAndForces(gpu);
        gpuDumpObcLoop1(gpu);
    }

    LAUNCHERROR("kCalculateCDLJObcGbsaForces1");
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using namespace std;
#include "gputypes.h"
#include "cudatypes.h"
#define UNROLLXX 0
#define UNROLLXY 0
// Shared-memory record holding one atom of the tile a thread group is
// working on. Field order (and therefore memory layout) matters to the
// kernel's shared-memory footprint and must not be changed.
struct Atom {
    float x, y, z;      // position, copied from pPosq .x/.y/.z
    float q;            // charge, copied unscaled from pPosq .w
    float sig, eps;     // per-atom attributes, copied from pAttr .x/.y
    float br;           // Born radius, copied from pBornRadii
    float fx, fy, fz;   // force accumulated on this atom by the other tile
    float fb;           // Born-force sum accumulated on this atom
};
// Per-block scratch: the kernel indexes this by threadIdx.x, one Atom slot per thread.
__shared__ Atom sA[GT2XX_NONBOND_THREADS_PER_BLOCK];
// Work units of this block, copied from cSim.pWorkUnit at kernel start.
__shared__ unsigned int sWorkUnit[GT2XX_NONBOND_WORKUNITS_PER_SM];
// Successor table: the kernel fills it with sNext[k] = (k + 1) & (GRID - 1).
__shared__ unsigned int sNext[GRID];
// File-local constant-memory copy of gpu->sim; written by SetCalculateCDLJObcGbsaForces1_12Sim.
static __constant__ cudaGmxSimulation cSim;
// Uploads the host-side simulation description (gpu->sim) into this file's
// constant-memory symbol cSim. RTERROR handles a failed copy.
void SetCalculateCDLJObcGbsaForces1_12Sim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Downloads this file's constant-memory symbol cSim back into the host-side
// simulation description (gpu->sim). RTERROR handles a failed copy.
void GetCalculateCDLJObcGbsaForces1_12Sim(gpuContext gpu)
{
    cudaError_t status;
    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    // Message previously said "SetSim", which misattributed the failure to
    // the upload path.
    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
}
// Accumulates, in one pass over the tile pairs, the Coulomb plus 12-6
// Lennard-Jones style ("CDLJ") pair forces and the first OBC/GBSA force loop.
// Cartesian forces go to cSim.pForce4a, Born-force sums to cSim.pBornForce.
// The kernel takes no arguments; every input comes from the file-local
// constant cSim.
//
// Unlike the variant that stores pre-scaled charges in shared memory, this
// kernel keeps the raw charge in shared memory and scales the register copy:
// apos.w carries epsfac * charge and the local q2 carries preFactor * charge.
//
// Each run of GRID consecutive threads (same threadIdx.x >> GRIDBITS) handles
// one work unit per pass of the while loop. A work unit names two tiles of
// GRID atoms (first atom indices x and y) and carries a flag saying whether
// an exclusion mask has to be applied.
//
// NOTE(review): inside the loops a thread reads and updates shared-memory
// slots written by other threads of its group, with no barrier in between.
// Presumably this relies on the GRID threads of a group executing in
// lockstep (GRID == warp size) -- confirm before porting to newer hardware.
__global__ void kCalculateCDLJObcGbsaForces1_12_kernel()
{
// Read queue of work blocks once so the remainder of
// kernel can run asynchronously
// Block b owns nbWorkUnitsPerBlock units, and the first
// nbWorkUnitsPerBlockRemainder blocks own one extra unit each.
int pos = cSim.nbWorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.nbWorkUnitsPerBlockRemainder);
int end = cSim.nbWorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.nbWorkUnitsPerBlockRemainder);
if (threadIdx.x < end - pos)
{
sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
}
// Build the successor table used to walk a tile in rotated order.
if (threadIdx.x < GRID)
{
sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
}
__syncthreads();
// Now change pos and end to reflect work queue just read
// into shared memory
// end becomes the number of units; each thread group starts at a different
// unit counted from the back and steps by cSim.nonbond_workBlock.
end = end - pos;
pos = end - (threadIdx.x >> GRIDBITS) - 1;
while (pos >= 0)
{
// Extract cell coordinates from appropriate work unit
// Layout as decoded below: bit 0 = exclusion flag, bits 2..16 = y tile
// index, bits 17 and up = x tile index. Tile index << GRIDBITS gives the
// index of the tile's first atom.
unsigned int x = sWorkUnit[pos];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
bool bExclusionFlag = (x & 0x1);
x = (x >> 17) << GRIDBITS;
// tgx: position of this thread inside its group.
unsigned int tgx = threadIdx.x & (GRID - 1);
// The x-tile atom i is kept in registers on every path.
unsigned int i = x + tgx;
float4 apos = cSim.pPosq[i];
float2 a = cSim.pAttr[i];
float br = cSim.pBornRadii[i];
unsigned int tbx = threadIdx.x - tgx;
int tj = tgx;
// psA[k] is the shared slot of the k-th thread of this group.
Atom* psA = &sA[tbx];
if (!bExclusionFlag)
{
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
sA[threadIdx.x].q = apos.w;
// q2 must be taken from the raw charge, before apos.w is scaled by epsfac.
float q2 = cSim.preFactor * apos.w;
apos.w *= cSim.epsfac;
sA[threadIdx.x].sig = a.x;
sA[threadIdx.x].eps = a.y;
sA[threadIdx.x].br = br;
// af.x/y/z: Cartesian force on atom i; af.w: Born-force sum for atom i.
float4 af;
af.x = 0.0f;
af.y = 0.0f;
af.z = 0.0f;
af.w = 0.0f;
// Every thread visits all GRID atoms of the tile and keeps only the
// contribution to its own atom, so each pair is evaluated twice.
// NOTE(review): for j == tgx the atom is paired with itself (r2 == 0), and
// nothing on this path masks the CDLJ term. Presumably diagonal tiles always
// carry the exclusion flag so this branch is not reached -- confirm.
for (unsigned int j = 0; j < GRID; j++)
{
float dx = psA[j].x - apos.x;
float dy = psA[j].y - apos.y;
float dz = psA[j].z - apos.z;
float r2 = dx * dx + dy * dy + dz * dz;
// CDLJ part
// sig6 = ((sig_i + sig_j) / r)^6, eps = eps_i * eps_j
float invR = 1.0f / sqrt(r2);
float sig = a.x + psA[j].sig;
float sig2 = invR * sig;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float eps = a.y * psA[j].eps;
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[j].q * invR;
dEdR *= invR * invR;
// ObcGbsaForce1 part
// alpha2_ij = br_i * br_j, D_ij = r^2 / (4 alpha2_ij),
// Gpol = q2_i * q_j / (r^2 + alpha2_ij * exp(-D_ij))^(3/2)
float alpha2_ij = br * psA[j].br;
float D_ij = r2 / (4.0f * alpha2_ij);
float expTerm = exp(-D_ij);
float denominator2 = r2 + alpha2_ij * expTerm;
float denominator = sqrt(denominator2);
float Gpol = (q2 * psA[j].q) / (denominator * denominator2);
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
af.w += dGpol_dalpha2_ij * psA[j].br;
dEdR += Gpol * (1.0f - 0.25f * expTerm);
// Add Forces
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
}
// Write results
// Slot for atom x + tgx in the output slice selected by tile index x >> GRIDBITS.
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = af;
cSim.pBornForce[offset] = af.w;
}
else // 100% utilization
{
// Read fixed atom data into registers and GRF
// Atom j = y + tgx goes to shared memory with zeroed accumulators.
int j = y + tgx;
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pAttr[j];
sA[threadIdx.x].br = cSim.pBornRadii[j];
float4 af;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
sA[threadIdx.x].fz = af.z = 0.0f;
sA[threadIdx.x].fb = af.w = 0.0f;
// q2 must be taken from the raw charge, before apos.w is scaled by epsfac.
float q2 = apos.w * cSim.preFactor;
apos.w *= cSim.epsfac;
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].q = temp.w;
sA[threadIdx.x].sig = temp1.x;
sA[threadIdx.x].eps = temp1.y;
// tj starts at tgx and advances through sNext, so in any iteration the
// threads of a group address distinct shared slots. Each pair is evaluated
// once and both partners are updated.
for (j = 0; j < GRID; j++)
{
float dx = psA[tj].x - apos.x;
float dy = psA[tj].y - apos.y;
float dz = psA[tj].z - apos.z;
float r2 = dx * dx + dy * dy + dz * dz;
// CDLJ part
float invR = 1.0f / sqrt(r2);
float sig = a.x + psA[tj].sig;
float sig2 = invR * sig;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float eps = a.y * psA[tj].eps;
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[tj].q * invR;
dEdR *= invR * invR;
// ObcGbsaForce1 part
float alpha2_ij = br * psA[tj].br;
float D_ij = r2 / (4.0f * alpha2_ij);
float expTerm = exp(-D_ij);
float denominator2 = r2 + alpha2_ij * expTerm;
float denominator = sqrt(denominator2);
float Gpol = (q2 * psA[tj].q) / (denominator * denominator2);
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
// Born-force sum: each partner is weighted by the other's Born radius.
af.w += dGpol_dalpha2_ij * psA[tj].br;
psA[tj].fb += dGpol_dalpha2_ij * br;
dEdR += Gpol * (1.0f - 0.25f * expTerm);
// Add forces
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
tj = sNext[tj];
}
// Write results
// Results for atom x + tgx go to the slice selected by the y tile index ...
int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = af;
cSim.pBornForce[offset] = af.w;
// ... and results for atom y + tgx to the slice selected by the x tile
// index. af.w is not overwritten here, so the .w lane of this second
// pForce4a store still holds atom i's Born sum; atom j's sum (fb) is
// written to pBornForce only.
af.x = sA[threadIdx.x].fx;
af.y = sA[threadIdx.x].fy;
af.z = sA[threadIdx.x].fz;
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = af;
cSim.pBornForce[offset] = sA[threadIdx.x].fb;
}
}
else // bExclusion
{
// Read exclusion data
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
// One mask word per thread: bit j (LSB first) is tested for partner j below.
unsigned int excl = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
float4 af;
af.x = 0.0f;
af.y = 0.0f;
af.z = 0.0f;
af.w = 0.0f;
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
sA[threadIdx.x].q = apos.w;
float q2 = cSim.preFactor * apos.w;
apos.w *= cSim.epsfac;
sA[threadIdx.x].sig = a.x;
sA[threadIdx.x].eps = a.y;
sA[threadIdx.x].br = br;
for (unsigned int j = 0; j < GRID; j++)
{
float dx = psA[j].x - apos.x;
float dy = psA[j].y - apos.y;
float dz = psA[j].z - apos.z;
float r2 = dx * dx + dy * dy + dz * dz;
// CDLJ part
float invR = 1.0f / sqrt(r2);
float sig = a.x + psA[j].sig;
float sig2 = invR * sig;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float eps = a.y * psA[j].eps;
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[j].q * invR;
dEdR *= invR * invR;
// A cleared mask bit drops only the CDLJ term; the GB term below is
// still added for this pair.
if (!(excl & 0x1))
{
dEdR = 0.0f;
}
// ObcGbsaForce1 part
float alpha2_ij = br * psA[j].br;
float D_ij = r2 / (4.0f * alpha2_ij);
float expTerm = exp(-D_ij);
float denominator2 = r2 + alpha2_ij * expTerm;
float denominator = sqrt(denominator2);
float Gpol = (q2 * psA[j].q) / (denominator * denominator2);
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
af.w += dGpol_dalpha2_ij * psA[j].br;
dEdR += Gpol * (1.0f - 0.25f * expTerm);
// Add Forces
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
excl >>= 1;
}
// Write results
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = af;
cSim.pBornForce[offset] = af.w;
}
else // 100% utilization
{
// Read fixed atom data into registers and GRF
unsigned int excl = cSim.pExclusion[x * cSim.exclusionStride + y + tgx];
float4 af;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
sA[threadIdx.x].fz = af.z = 0.0f;
sA[threadIdx.x].fb = af.w = 0.0f;
int j = y + tgx;
float q2 = cSim.preFactor * apos.w;
apos.w *= cSim.epsfac;
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pAttr[j];
sA[threadIdx.x].br = cSim.pBornRadii[j];
// Rotate the mask right by tgx so that bit 0 lines up with the first
// partner visited (tj == tgx).
// NOTE(review): for tgx == 0 the left shift count is GRID; if GRID equals
// the bit width of unsigned int that shift is undefined in C++ -- confirm.
excl = (excl >> tgx) | (excl << (GRID - tgx));
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].q = temp.w;
sA[threadIdx.x].sig = temp1.x;
sA[threadIdx.x].eps = temp1.y;
for (j = 0; j < GRID; j++)
{
float dx = psA[tj].x - apos.x;
float dy = psA[tj].y - apos.y;
float dz = psA[tj].z - apos.z;
float r2 = dx * dx + dy * dy + dz * dz;
// CDLJ part
float invR = 1.0f / sqrt(r2);
float sig = a.x + psA[tj].sig;
float sig2 = invR * sig;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float eps = a.y * psA[tj].eps;
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
dEdR += apos.w * psA[tj].q * invR;
dEdR *= invR * invR;
if (!(excl & 0x1))
{
dEdR = 0.0f;
}
// ObcGbsaForce1 part
float alpha2_ij = br * psA[tj].br;
float D_ij = r2 / (4.0f * alpha2_ij);
float expTerm = exp(-D_ij);
float denominator2 = r2 + alpha2_ij * expTerm;
float denominator = sqrt(denominator2);
float Gpol = (q2 * psA[tj].q) / (denominator * denominator2);
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
af.w += dGpol_dalpha2_ij * psA[tj].br;
psA[tj].fb += dGpol_dalpha2_ij * br;
dEdR += Gpol * (1.0f - 0.25f * expTerm);
// Add forces
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
excl >>= 1;
tj = sNext[tj];
}
// Write results
int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = af;
cSim.pBornForce[offset] = af.w;
// Second store: results for atom y + tgx (af.w is left unchanged, see the
// matching store in the non-exclusion branch).
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
af.x = sA[threadIdx.x].fx;
af.y = sA[threadIdx.x].fy;
af.z = sA[threadIdx.x].fz;
cSim.pForce4a[offset] = af;
cSim.pBornForce[offset] = sA[threadIdx.x].fb;
}
}
// Move on to this group's next work unit.
pos -= cSim.nonbond_workBlock;
}
}
// Host-side launcher for kCalculateCDLJObcGbsaForces1_12_kernel.
// The grid and block sizes are taken from gpu->sim (nonbond_blocks and
// nonbond_threads_per_block); no dynamic shared memory is requested.
// LAUNCHERROR is invoked immediately after the launch with this function's
// name as the tag.
void kCalculateCDLJObcGbsaForces1_12(gpuContext gpu)
{
    const unsigned int gridSize  = gpu->sim.nonbond_blocks;
    const unsigned int blockSize = gpu->sim.nonbond_threads_per_block;

    kCalculateCDLJObcGbsaForces1_12_kernel<<<gridSize, blockSize>>>();
    LAUNCHERROR("kCalculateCDLJObcGbsaForces1_12");
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using namespace std;
#include "gputypes.h"
// Dynamically sized shared-memory scratch array. kCalculateLocalForces_kernel
// indexes it with threadIdx.x, and kCalculateLocalForces() passes
// threads_per_block * sizeof(Vectors) as the dynamic shared-memory size.
extern __shared__ Vectors sV[];
// File-local constant-memory copy of the simulation descriptor; written by
// SetCalculateLocalForcesSim() and read back by GetCalculateLocalForcesSim().
static __constant__ cudaGmxSimulation cSim;
// ---------------------------------------------------------------------------
// Small vector/angle helper macros used by kCalculateLocalForces_kernel.
// All of them operate on objects exposing float members .x, .y and .z.
// The statement-like macros expand to a braced block and declare locals
// (norm1, norm2, dp, angle, deltaIdeal, clampedCosine); do not pass arguments
// with those names.
// ---------------------------------------------------------------------------

// Dot product of two 3-vectors (expression macro).
#define DOT3(v1, v2) ((v1).x * (v2).x + (v1).y * (v2).y + (v1).z * (v2).z)

// dp = cosine of the angle between v1 and v2, clamped to [-1, 1] so that it
// is always a valid acos() argument.
#define GETNORMEDDOTPRODUCT(v1, v2, dp) \
{ \
    dp = DOT3(v1, v2); \
    float norm1 = DOT3(v1, v1); \
    float norm2 = DOT3(v2, v2); \
    dp /= sqrt(norm1 * norm2); \
    dp = min(dp, 1.0f); \
    dp = max(dp, -1.0f); \
}

// c = v1 x v2. c must not alias v1 or v2, because c.x is overwritten before
// the remaining components are computed. Braced so the three assignments
// always behave as a single statement.
#define CROSS_PRODUCT(v1, v2, c) \
{ \
    (c).x = (v1).y * (v2).z - (v1).z * (v2).y; \
    (c).y = (v1).z * (v2).x - (v1).x * (v2).z; \
    (c).z = (v1).x * (v2).y - (v1).y * (v2).x; \
}

// dEdR = param.y * (acos(cosine) - param.x converted from degrees to radians).
// The cosine is clamped to [-1, 1] first: callers compute it as
// dot / sqrt(r21 * r23), which rounding can push marginally outside the
// valid acos() domain for (anti)parallel vectors, and acos() would then
// return NaN.
#define GETPREFACTORSGIVENANGLECOSINE(cosine, param, dEdR) \
{ \
    float clampedCosine = min(max((cosine), -1.0f), 1.0f); \
    float angle = acos(clampedCosine); \
    float deltaIdeal = angle - ((param).x * (3.14159265f / 180.0f)); \
    dEdR = (param).y * deltaIdeal; \
}

// angle = unsigned angle (radians) between v1 and v2.
#define GETANGLEBETWEENTWOVECTORS(v1, v2, angle) \
{ \
    float dp; \
    GETNORMEDDOTPRODUCT(v1, v2, dp); \
    angle = acos(dp); \
}

// Same as GETANGLEBETWEENTWOVECTORS, but also returns the clamped cosine.
#define GETANGLECOSINEBETWEENTWOVECTORS(v1, v2, angle, cosine) \
{ \
    GETNORMEDDOTPRODUCT(v1, v2, cosine); \
    angle = acos(cosine); \
}

// Signed dihedral angle defined by three vectors:
//   cp0 = vector1 x vector2, cp1 = vector2 x vector3,
//   angle = angle(cp0, cp1), negated when dot(signVector, cp1) < 0.
// cp0 and cp1 are outputs and remain available to the caller.
#define GETDIHEDRALANGLEBETWEENTHREEVECTORS(vector1, vector2, vector3, signVector, cp0, cp1, angle) \
{ \
    CROSS_PRODUCT(vector1, vector2, cp0); \
    CROSS_PRODUCT(vector2, vector3, cp1); \
    GETANGLEBETWEENTWOVECTORS(cp0, cp1, angle); \
    float dp = DOT3(signVector, cp1); \
    angle = (dp >= 0) ? angle : -angle; \
}

// Same as GETDIHEDRALANGLEBETWEENTHREEVECTORS, but also returns the clamped
// cosine of the (unsigned) angle between cp0 and cp1.
#define GETDIHEDRALANGLECOSINEBETWEENTHREEVECTORS(vector1, vector2, vector3, signVector, cp0, cp1, angle, cosine) \
{ \
    CROSS_PRODUCT(vector1, vector2, cp0); \
    CROSS_PRODUCT(vector2, vector3, cp1); \
    GETANGLECOSINEBETWEENTWOVECTORS(cp0, cp1, angle, cosine); \
    float dp = DOT3(signVector, cp1); \
    angle = (dp >= 0) ? angle : -angle; \
}
// Uploads the host-side simulation descriptor (gpu->sim) into this file's
// constant-memory copy, cSim. The copy status is handed to RTERROR together
// with a fixed diagnostic message.
void SetCalculateLocalForcesSim(gpuContext gpu)
{
    cudaError_t err = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(err, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Downloads this file's constant-memory simulation descriptor (cSim) back
// into gpu->sim. The copy status is handed to RTERROR; the message names the
// Get direction so a failure here is not mistaken for one in the Set routine.
void GetCalculateLocalForcesSim(gpuContext gpu)
{
    cudaError_t status;
    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
}
// Computes the bonded ("local") force terms and adds them into cSim.pForce4.
//
// Work layout: a single running index `pos` (starting at the thread's global
// id and advancing by blockDim.x * gridDim.x) is walked through five
// consecutive index ranges, one per term type:
//   [0, bond_offset)                        two-atom bond terms
//   [bond_offset, bond_angle_offset)        three-atom angle terms
//   [bond_angle_offset, dihedral_offset)    four-atom periodic dihedral terms
//   [dihedral_offset, rb_dihedral_offset)   four-atom polynomial-in-cos(phi)
//                                           dihedral terms (presumably
//                                           Ryckaert-Bellemans - confirm)
//   [rb_dihedral_offset, LJ14_offset)       two-atom LJ + Coulomb-like terms
// Inside each range the term index is checked against the real term count,
// so the ranges may be larger than the number of terms they hold.
//
// Output convention: every atom of every term carries its own output-buffer
// index; the force slot is atom + buffer * cSim.stride. If the buffer index is
// below cSim.totalNonbondOutputBuffers the existing value is loaded and added
// to, otherwise accumulation starts from zero.
// NOTE(review): presumably the ID tables are built so that no two concurrently
// processed terms share a slot, since the read-modify-write is not atomic -
// confirm against the host-side setup code.
//
// Shared memory: one Vectors entry of the dynamic array sV per thread.
__global__ void kCalculateLocalForces_kernel()
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
// Per-thread scratch vectors (v0, v1, v2); never accessed by other threads here.
Vectors* A = &sV[threadIdx.x];
// ---- Range 1: two-atom bond terms ------------------------------------------
while (pos < cSim.bond_offset)
{
if (pos < cSim.bonds)
{
// atom.x / atom.y: atom indices; atom.z / atom.w: their output-buffer indices.
int4 atom = cSim.pBondID[pos];
float4 atomA = cSim.pPosq[atom.x];
float4 atomB = cSim.pPosq[atom.y];
// bond.x is subtracted from the distance, bond.y scales the deviation.
float2 bond = cSim.pBondParameter[pos];
float dx = atomB.x - atomA.x;
float dy = atomB.y - atomA.y;
float dz = atomB.z - atomA.z;
float r2 = dx * dx + dy * dy + dz * dz;
float r = sqrt(r2);
float deltaIdeal = r - bond.x;
float dEdR = bond.y * deltaIdeal;
// Divide by r to turn (dx, dy, dz) into a direction; coincident atoms get zero force.
dEdR = (r > 0.0f) ? (dEdR / r) : 0.0f;
// printf("D: %11.4f %11.4f %11.4f %11.4f %11.4f %11.4f\n", dx, dy, dz, r, deltaIdeal, dEdR);
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
unsigned int offsetA = atom.x + atom.z * cSim.stride;
unsigned int offsetB = atom.y + atom.w * cSim.stride;
// Load the existing buffer contents only for buffers below totalNonbondOutputBuffers.
float4 forceA = {0.0f, 0.0f, 0.0f, 0.0f};
if (atom.z < cSim.totalNonbondOutputBuffers)
forceA = cSim.pForce4[offsetA];
float4 forceB = {0.0f, 0.0f, 0.0f, 0.0f};
if (atom.w < cSim.totalNonbondOutputBuffers)
forceB = cSim.pForce4[offsetB];
// Equal and opposite contributions on the two atoms.
forceA.x += dx;
forceA.y += dy;
forceA.z += dz;
forceB.x -= dx;
forceB.y -= dy;
forceB.z -= dz;
cSim.pForce4[offsetA] = forceA;
cSim.pForce4[offsetB] = forceB;
}
pos += blockDim.x * gridDim.x;
}
// ---- Range 2: three-atom angle terms ---------------------------------------
while (pos < cSim.bond_angle_offset)
{
unsigned int pos1 = pos - cSim.bond_offset;
if (pos1 < cSim.bond_angles)
{
// atom1.x/.y/.z: atom indices (a2 = atom1.y is the vertex); atom1.w: buffer of the first atom.
int4 atom1 = cSim.pBondAngleID1[pos1];
// bond_angle.x: reference angle in degrees; bond_angle.y: scale factor.
float2 bond_angle = cSim.pBondAngleParameter[pos1];
float4 a1 = cSim.pPosq[atom1.x];
float4 a2 = cSim.pPosq[atom1.y];
float4 a3 = cSim.pPosq[atom1.z];
// v0 and v1 both equal (vertex - end atom).
A->v0.x = a2.x - a1.x;
A->v0.y = a2.y - a1.y;
A->v0.z = a2.z - a1.z;
A->v1.x = a2.x - a3.x;
A->v1.y = a2.y - a3.y;
A->v1.z = a2.z - a3.z;
float3 cp;
CROSS_PRODUCT(A->v0, A->v1, cp);
float rp = DOT3(cp, cp); //cx * cx + cy * cy + cz * cz;
// Floor |cp| so the divisions below stay finite when v0 and v1 are (nearly) parallel.
rp = max(sqrt(rp), 1.0e-06f);
float r21 = DOT3(A->v0, A->v0); // dx1 * dx1 + dy1 * dy1 + dz1 * dz1;
float r23 = DOT3(A->v1, A->v1); // dx2 * dx2 + dy2 * dy2 + dz2 * dz2;
float dot = DOT3(A->v0, A->v1); // dx1 * dx2 + dy1 * dy2 + dz1 * dz2;
float cosine = dot / sqrt(r21 * r23);
float dEdR;
GETPREFACTORSGIVENANGLECOSINE(cosine, bond_angle, dEdR);
//printf("%11.4f %11.4f\n", cosine, dEdR);
float termA = dEdR / (r21 * rp);
float termC = -dEdR / (r23 * rp);
float3 c21;
float3 c23;
CROSS_PRODUCT(A->v0, cp, c21);
CROSS_PRODUCT(A->v1, cp, c23);
c21.x *= termA;
c21.y *= termA;
c21.z *= termA;
c23.x *= termC;
c23.y *= termC;
c23.z *= termC;
// atom2.x / atom2.y: output-buffer indices for the second and third atoms.
int2 atom2 = cSim.pBondAngleID2[pos1];
// First atom receives +c21.
unsigned int offset = atom1.x + atom1.w * cSim.stride;
float4 force = {0.0f, 0.0f, 0.0f, 0.0f};
if (atom1.w < cSim.totalNonbondOutputBuffers)
force = cSim.pForce4[offset];
force.x += c21.x;
force.y += c21.y;
force.z += c21.z;
cSim.pForce4[offset] = force;
// Vertex atom receives -(c21 + c23).
// NOTE(review): only x/y/z are cleared before reuse, so force.w keeps whatever
// the previous atom's slot held when no load happens; presumably .w of pForce4
// is unused - confirm.
offset = atom1.y + atom2.x * cSim.stride;
force.x = force.y = force.z = 0.0f;
if (atom2.x < cSim.totalNonbondOutputBuffers)
force = cSim.pForce4[offset];
force.x -= (c21.x + c23.x);
force.y -= (c21.y + c23.y);
force.z -= (c21.z + c23.z);
cSim.pForce4[offset] = force;
// Third atom receives +c23.
offset = atom1.z + atom2.y * cSim.stride;
force.x = force.y = force.z = 0.0f;
if (atom2.y < cSim.totalNonbondOutputBuffers)
force = cSim.pForce4[offset];
force.x += c23.x;
force.y += c23.y;
force.z += c23.z;
cSim.pForce4[offset] = force;
}
pos += blockDim.x * gridDim.x;
}
// ---- Range 3: four-atom periodic dihedral terms ----------------------------
while (pos < cSim.dihedral_offset)
{
unsigned int pos1 = pos - cSim.bond_angle_offset;
if (pos1 < cSim.dihedrals)
{
// atom1.x..w: the four atom indices A, B, C, D.
int4 atom1 = cSim.pDihedralID1[pos1];
float4 atomA = cSim.pPosq[atom1.x];
float4 atomB = cSim.pPosq[atom1.y];
float4 atomC = cSim.pPosq[atom1.z];
float4 atomD = cSim.pPosq[atom1.w];
// v0 = A - B, v1 = C - B, v2 = C - D.
A->v0.x = atomA.x - atomB.x;
A->v0.y = atomA.y - atomB.y;
A->v0.z = atomA.z - atomB.z;
A->v1.x = atomC.x - atomB.x;
A->v1.y = atomC.y - atomB.y;
A->v1.z = atomC.z - atomB.z;
A->v2.x = atomC.x - atomD.x;
A->v2.y = atomC.y - atomD.y;
A->v2.z = atomC.z - atomD.z;
float3 cp0, cp1;
float dihedralAngle;
// cp0 = v0 x v1, cp1 = v1 x v2; the sign of the angle comes from dot(v0, cp1).
GETDIHEDRALANGLEBETWEENTHREEVECTORS(A->v0, A->v1, A->v2, A->v0, cp0, cp1, dihedralAngle);
// dihedral.x: amplitude; dihedral.y: phase in degrees; dihedral.z: multiplier of the angle.
float4 dihedral = cSim.pDihedralParameter[pos1];
float deltaAngle = dihedral.z * dihedralAngle - (dihedral.y * 3.14159265f / 180.0f);
float sinDeltaAngle = sin(deltaAngle);
float dEdAngle = -dihedral.x * dihedral.z * sinDeltaAngle;
// ff.x / ff.w scale cp0 / cp1 into the forces on the end atoms A and D;
// ff.y / ff.z are the projections of v0 / v2 onto v1 divided by |v1|^2.
float normCross1 = DOT3(cp0, cp0);
float normBC = sqrt(DOT3(A->v1, A->v1));
float4 ff;
ff.x = (-dEdAngle * normBC) / normCross1;
float normCross2 = DOT3(cp1, cp1);
ff.w = (dEdAngle * normBC) / normCross2;
float dp = 1.0f / DOT3(A->v1, A->v1);
ff.y = DOT3(A->v0, A->v1) * dp;
ff.z = DOT3(A->v2, A->v1) * dp;
// atom2.x..w: output-buffer indices for atoms A, B, C, D respectively.
int4 atom2 = cSim.pDihedralID2[pos1];
float3 internalF0;
float3 internalF3;
float3 s;
// printf("%4d: %9.4f %9.4f %9.4f %9.4f\n", pos1, ff.x, ff.y, ff.z, ff.w);
// Atom A: += ff.x * cp0.
unsigned int offset = atom1.x + atom2.x * cSim.stride;
float4 force = {0.0f, 0.0f, 0.0f, 0.0f};
if (atom2.x < cSim.totalNonbondOutputBuffers)
force = cSim.pForce4[offset];
internalF0.x = ff.x * cp0.x;
force.x += internalF0.x;
internalF0.y = ff.x * cp0.y;
force.y += internalF0.y;
internalF0.z = ff.x * cp0.z;
force.z += internalF0.z;
cSim.pForce4[offset] = force;
//printf("%4d - 0: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
// Atom D: += ff.w * cp1.
offset = atom1.w + atom2.w * cSim.stride;
force.x = force.y = force.z = 0.0f;
if (atom2.w < cSim.totalNonbondOutputBuffers)
force = cSim.pForce4[offset];
internalF3.x = ff.w * cp1.x;
force.x += internalF3.x;
internalF3.y = ff.w * cp1.y;
force.y += internalF3.y;
internalF3.z = ff.w * cp1.z;
force.z += internalF3.z;
cSim.pForce4[offset] = force;
// printf("%4d - 3: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
// s redistributes the end-atom forces onto the two middle atoms; the four
// contributions (F0, F3, -F0 + s, -F3 - s) sum to zero.
s.x = ff.y * internalF0.x - ff.z * internalF3.x;
s.y = ff.y * internalF0.y - ff.z * internalF3.y;
s.z = ff.y * internalF0.z - ff.z * internalF3.z;
// Atom B: += -F0 + s.
offset = atom1.y + atom2.y * cSim.stride;
force.x = force.y = force.z = 0.0f;
if (atom2.y < cSim.totalNonbondOutputBuffers)
force = cSim.pForce4[offset];
force.x += -internalF0.x + s.x;
force.y += -internalF0.y + s.y;
force.z += -internalF0.z + s.z;
cSim.pForce4[offset] = force;
//printf("%4d - 1: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
// Atom C: += -F3 - s.
offset = atom1.z + atom2.z * cSim.stride;
force.x = force.y = force.z = 0.0f;
if (atom2.z < cSim.totalNonbondOutputBuffers)
force = cSim.pForce4[offset];
force.x += -internalF3.x - s.x;
force.y += -internalF3.y - s.y;
force.z += -internalF3.z - s.z;
cSim.pForce4[offset] = force;
//printf("%4d - 2: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
}
pos += blockDim.x * gridDim.x;
}
// ---- Range 4: four-atom dihedral terms, polynomial in cos(phi) -------------
while (pos < cSim.rb_dihedral_offset)
{
unsigned int pos1 = pos - cSim.dihedral_offset;
if (pos1 < cSim.rb_dihedrals)
{
int4 atom1 = cSim.pRbDihedralID1[pos1];
float4 atomA = cSim.pPosq[atom1.x];
float4 atomB = cSim.pPosq[atom1.y];
float4 atomC = cSim.pPosq[atom1.z];
float4 atomD = cSim.pPosq[atom1.w];
// Same vector construction as the periodic dihedral range above.
A->v0.x = atomA.x - atomB.x;
A->v0.y = atomA.y - atomB.y;
A->v0.z = atomA.z - atomB.z;
A->v1.x = atomC.x - atomB.x;
A->v1.y = atomC.y - atomB.y;
A->v1.z = atomC.z - atomB.z;
A->v2.x = atomC.x - atomD.x;
A->v2.y = atomC.y - atomD.y;
A->v2.z = atomC.z - atomD.z;
float3 cp0, cp1;
float dihedralAngle, cosPhi;
// printf("%4d - 0 : %9.4f %9.4f %9.4f\n", pos1, A->v0.x, A->v0.y, A->v0.z);
// printf("%4d - 1 : %9.4f %9.4f %9.4f\n", pos1, A->v1.x, A->v1.y, A->v1.z);
// printf("%4d - 2 : %9.4f %9.4f %9.4f\n", pos1, A->v2.x, A->v2.y, A->v2.z);
GETDIHEDRALANGLECOSINEBETWEENTHREEVECTORS(A->v0, A->v1, A->v2, A->v0, cp0, cp1, dihedralAngle, cosPhi);
// Shift the angle by pi (keeping it within [-pi, pi]) and negate the cosine
// to match, i.e. this term is evaluated at phi +/- pi.
if (dihedralAngle < 0.0f )
{
dihedralAngle += 3.14159265f;
}
else
{
dihedralAngle -= 3.14159265f;
}
cosPhi = -cosPhi;
// printf("%4d: %9.4f %9.4f\n", pos1, dihedralAngle, cosPhi);
// Coefficients used below: dihedral1.y, .z, .w and dihedral2.x, .y multiply
// cosPhi^0 .. cosPhi^4 with weights 1..5; dihedral1.x is not used in this kernel.
float4 dihedral1 = cSim.pRbDihedralParameter1[pos1];
float2 dihedral2 = cSim.pRbDihedralParameter2[pos1];
float cosFactor = cosPhi;
float dEdAngle = -dihedral1.y;
// printf("%4d - 1: %9.4f %9.4f\n", pos1, dEdAngle, 1.0f);
dEdAngle -= 2.0f * dihedral1.z * cosFactor;
// printf("%4d - 2: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
cosFactor *= cosPhi;
dEdAngle -= 3.0f * dihedral1.w * cosFactor;
// printf("%4d - 3: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
cosFactor *= cosPhi;
dEdAngle -= 4.0f * dihedral2.x * cosFactor;
// printf("%4d - 4: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
cosFactor *= cosPhi;
dEdAngle -= 5.0f * dihedral2.y * cosFactor;
// printf("%4d - 5: %9.4f %9.4f\n", pos1, dEdAngle, cosFactor);
dEdAngle *= sin(dihedralAngle);
// printf("%4d - f: %9.4f\n", pos1, dEdAngle);
// From here on the force distribution is identical to the periodic dihedral range.
float normCross1 = DOT3(cp0, cp0);
float normBC = sqrt(DOT3(A->v1, A->v1));
float4 ff;
ff.x = (-dEdAngle * normBC) / normCross1;
float normCross2 = DOT3(cp1, cp1);
ff.w = (dEdAngle * normBC) / normCross2;
float dp = 1.0f / DOT3(A->v1, A->v1);
ff.y = DOT3(A->v0, A->v1) * dp;
ff.z = DOT3(A->v2, A->v1) * dp;
// atom2.x..w: output-buffer indices for atoms A, B, C, D respectively.
int4 atom2 = cSim.pRbDihedralID2[pos1];
float3 internalF0;
float3 internalF3;
float3 s;
// printf("%4d: %9.4f %9.4f %9.4f %9.4f\n", pos1, ff.x, ff.y, ff.z, ff.w);
// Atom A: += ff.x * cp0.
unsigned int offset = atom1.x + atom2.x * cSim.stride;
float4 force = {0.0f, 0.0f, 0.0f, 0.0f};
if (atom2.x < cSim.totalNonbondOutputBuffers)
force = cSim.pForce4[offset];
internalF0.x = ff.x * cp0.x;
force.x += internalF0.x;
internalF0.y = ff.x * cp0.y;
force.y += internalF0.y;
internalF0.z = ff.x * cp0.z;
force.z += internalF0.z;
cSim.pForce4[offset] = force;
// printf("%4d - 0: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
// Atom D: += ff.w * cp1.
offset = atom1.w + atom2.w * cSim.stride;
force.x = force.y = force.z = 0.0f;
if (atom2.w < cSim.totalNonbondOutputBuffers)
force = cSim.pForce4[offset];
internalF3.x = ff.w * cp1.x;
force.x += internalF3.x;
internalF3.y = ff.w * cp1.y;
force.y += internalF3.y;
internalF3.z = ff.w * cp1.z;
force.z += internalF3.z;
cSim.pForce4[offset] = force;
// printf("%4d - 3: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
s.x = ff.y * internalF0.x - ff.z * internalF3.x;
s.y = ff.y * internalF0.y - ff.z * internalF3.y;
s.z = ff.y * internalF0.z - ff.z * internalF3.z;
// Atom B: += -F0 + s.
offset = atom1.y + atom2.y * cSim.stride;
force.x = force.y = force.z = 0.0f;
if (atom2.y < cSim.totalNonbondOutputBuffers)
force = cSim.pForce4[offset];
force.x += -internalF0.x + s.x;
force.y += -internalF0.y + s.y;
force.z += -internalF0.z + s.z;
cSim.pForce4[offset] = force;
// printf("%4d - 1: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
// Atom C: += -F3 - s.
offset = atom1.z + atom2.z * cSim.stride;
force.x = force.y = force.z = 0.0f;
if (atom2.z < cSim.totalNonbondOutputBuffers)
force = cSim.pForce4[offset];
force.x += -internalF3.x - s.x;
force.y += -internalF3.y - s.y;
force.z += -internalF3.z - s.z;
cSim.pForce4[offset] = force;
// printf("%4d - 2: %9.4f %9.4f %9.4f\n", pos1, cSim.pForce[offset], cSim.pForce[offset + cSim.stride], cSim.pForce[offset + cSim.stride2]);
}
pos += blockDim.x * gridDim.x;
}
// ---- Range 5: two-atom LJ + Coulomb-like pair terms ------------------------
while (pos < cSim.LJ14_offset)
{
unsigned int pos1 = pos - cSim.rb_dihedral_offset;
if (pos1 < cSim.LJ14s)
{
// atom.x / atom.y: atom indices; atom.z / atom.w: their output-buffer indices.
int4 atom = cSim.pLJ14ID[pos1];
// LJ14.x scales the 12-6 term, LJ14.y is the length that is divided by r,
// LJ14.z scales the 1/r term; LJ14.w is not used here.
float4 LJ14 = cSim.pLJ14Parameter[pos1];
float4 a1 = cSim.pPosq[atom.x];
float4 a2 = cSim.pPosq[atom.y];
float3 d;
d.x = a1.x - a2.x;
d.y = a1.y - a2.y;
d.z = a1.z - a2.z;
float r2 = DOT3(d, d);
// No guard against r2 == 0: coincident atoms would produce inf/NaN here.
float inverseR = 1.0f / sqrt(r2);
float sig2 = inverseR * LJ14.y;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float dEdR = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;
dEdR += LJ14.z * inverseR;
// Extra 1/r^2 converts the scalar into a multiplier for the unnormalised d.
dEdR *= inverseR * inverseR;
unsigned int offsetA = atom.x + atom.z * cSim.stride;
unsigned int offsetB = atom.y + atom.w * cSim.stride;
float4 forceA = {0.0f, 0.0f, 0.0f, 0.0f};
if (atom.z < cSim.totalNonbondOutputBuffers)
forceA = cSim.pForce4[offsetA];
float4 forceB = {0.0f, 0.0f, 0.0f, 0.0f};
if (atom.w < cSim.totalNonbondOutputBuffers)
forceB = cSim.pForce4[offsetB];
d.x *= dEdR;
d.y *= dEdR;
d.z *= dEdR;
// Equal and opposite contributions on the two atoms.
forceA.x += d.x;
forceA.y += d.y;
forceA.z += d.z;
forceB.x -= d.x;
forceB.y -= d.y;
forceB.z -= d.z;
cSim.pForce4[offsetA] = forceA;
cSim.pForce4[offsetB] = forceB;
}
pos += blockDim.x * gridDim.x;
}
}
// Host-side launcher for kCalculateLocalForces_kernel.
// Launch shape: gpu->sim.blocks blocks of
// gpu->sim.localForces_threads_per_block threads, with one Vectors entry of
// dynamic shared memory per thread. LAUNCHERROR is invoked right after the
// launch with this function's name as the tag.
void kCalculateLocalForces(gpuContext gpu)
{
    const unsigned int blockSize   = gpu->sim.localForces_threads_per_block;
    const size_t       sharedBytes = blockSize * sizeof(Vectors);

    kCalculateLocalForces_kernel<<<gpu->sim.blocks, blockSize, sharedBytes>>>();
    LAUNCHERROR("kCalculateLocalForces");
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using namespace std;
#include "gputypes.h"
// NOTE(review): neither macro is referenced in the portion of this file that
// follows; presumably they are leftover loop-unrolling switches - confirm
// before removing.
#define UNROLLXX 0
#define UNROLLXY 0
// Per-thread shared-memory record used by kCalculateObcGbsaBornSum_kernel.
// Seven consecutive floats; the member order is part of the layout.
struct Atom {
    float x, y, z;  // position, copied from the .x/.y/.z of a pPosq entry
    float r;        // first component of the atom's pObcData entry
    float sr;       // second component of the atom's pObcData entry
    float sum;      // Born-sum accumulator for this atom (off-diagonal tiles)
    float junk;     // never referenced by the kernel; presumably padding
};
// One Atom record per thread of the block (indexed by threadIdx.x in
// kCalculateObcGbsaBornSum_kernel).
__shared__ Atom sA[GT2XX_NONBOND_THREADS_PER_BLOCK];
// The block's slice of cSim.pWorkUnit, copied once at kernel start.
__shared__ unsigned int sWorkUnit[GT2XX_NONBOND_WORKUNITS_PER_SM];
// Rotation table filled by the kernel: sNext[k] = (k - 1) & (GRID - 1).
__shared__ unsigned int sNext[GRID];
// File-local constant-memory copy of the simulation descriptor; written by
// SetCalculateObcGbsaBornSumSim().
static __constant__ cudaGmxSimulation cSim;
// Uploads the host-side simulation descriptor (gpu->sim) into this file's
// constant-memory copy, cSim. The copy status is handed to RTERROR together
// with a fixed diagnostic message.
void SetCalculateObcGbsaBornSumSim(gpuContext gpu)
{
    cudaError_t err = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(err, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Downloads this file's constant-memory simulation descriptor (cSim) back
// into gpu->sim. The copy status is handed to RTERROR; the message names the
// Get direction so a failure here is not mistaken for one in the Set routine.
void GetCalculateObcGbsaBornSumSim(gpuContext gpu)
{
    cudaError_t status;
    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
}
// Per-atom reduction of the Born sums followed by the Born-radius update.
//
// For every atom index below cSim.atoms (threads advance by the total number
// of launched threads) the kernel
//   1. adds up cSim.nonbondOutputBuffers partial sums stored cSim.stride
//      elements apart in cSim.pBornSum, starting at the atom's own index;
//   2. scales the total by 0.5 * pObcData[atom].x and feeds it through the
//      cubic (alphaOBC, betaOBC, gammaOBC) and tanh;
//   3. stores the resulting radius in cSim.pBornRadii and the matching
//      derivative factor in cSim.pObcChain.
// No shared memory is used and no synchronisation is required.
__global__ void kReduceObcGbsaBornSum_kernel()
{
    const unsigned int totalThreads = gridDim.x * blockDim.x;

    for (unsigned int atomIdx = blockIdx.x * blockDim.x + threadIdx.x;
         atomIdx < cSim.atoms;
         atomIdx += totalThreads)
    {
        float2 obc = cSim.pObcData[atomIdx];

        // Accumulate the partial sums buffer by buffer, in buffer order.
        float  psi     = 0.0f;
        float* pBuffer = cSim.pBornSum + atomIdx;
        for (int buffer = 0; buffer < cSim.nonbondOutputBuffers; buffer++, pBuffer += cSim.stride)
        {
            psi += *pBuffer;
        }

        // Born radius and OBC chain-rule term.
        psi *= 0.5f * obc.x;
        float psi2            = psi * psi;
        float psi3            = psi * psi2;
        float tanhPsi         = tanh(cSim.alphaOBC * psi - cSim.betaOBC * psi2 + cSim.gammaOBC * psi3);
        float fullRadius      = obc.x + cSim.dielectricOffset;
        float radius          = 1.0f / (1.0f / obc.x - tanhPsi / fullRadius);
        float chain           = obc.x * (cSim.alphaOBC - 2.0f * cSim.betaOBC * psi + 3.0f * cSim.gammaOBC * psi2);
        chain                 = (1.0f - tanhPsi * tanhPsi) * chain / fullRadius;

        cSim.pBornRadii[atomIdx] = radius;
        cSim.pObcChain[atomIdx]  = chain;
    }
}
// Host-side launcher for kReduceObcGbsaBornSum_kernel.
// Uses gpu->sim.blocks blocks of a fixed 384 threads, then clears
// gpu->bRecalculateBornRadii. A debugging dump of several device arrays is
// present but permanently disabled. LAUNCHERROR runs last.
void kReduceObcGbsaBornSum(gpuContext gpu)
{
    const unsigned int threadsPerBlock = 384;
    kReduceObcGbsaBornSum_kernel<<<gpu->sim.blocks, threadsPerBlock>>>();
    gpu->bRecalculateBornRadii = false;

    // Debug-only array dump; flip the flag to enable it.
    const bool dumpDebugArrays = false;
    if (dumpDebugArrays)
    {
        static int step = 0;
        int numPrint = -1;
        ++step;
        WriteArrayToFile1(gpu, "ObcGbsaBornBRad", step, gpu->psBornRadii, numPrint);
        WriteArrayToFile1(gpu, "ObcGbsaBornSum", step, gpu->psBornSum, numPrint);
        WriteArrayToFile2(gpu, "ObcGbsaObcData", step, gpu->psObcData, numPrint);
        WriteArrayToFile4(gpu, "ObcGbsaBornPos", step, gpu->psPosq4, numPrint);
        gpuDumpObcInfo(gpu);
    }

    LAUNCHERROR("kReduceObcGbsaBornSum");
}
// Accumulates the pairwise OBC Born-sum contributions, tile by tile.
//
// Work layout
//   * Each block copies its contiguous slice of cSim.pWorkUnit into sWorkUnit.
//   * Threads are grouped in runs of GRID consecutive threads
//     (tgx = threadIdx.x & (GRID - 1)); each group processes one work unit at
//     a time, starting from the end of the slice and stepping back by
//     cSim.nonbond_workBlock.
//   * A work unit packs two tile origins: bits 17 and up give x, bits 2..16
//     give y (both multiplied by GRID via << GRIDBITS).
//
// Per tile
//   * x == y (diagonal): every thread owns atom x + tgx and loops over all
//     GRID atoms of the same tile, skipping itself. Only the i-side sum is
//     produced.
//   * x != y: every thread owns atom x + tgx, stages atom y + tgx in shared
//     memory, and visits the staged atoms in the rotated order given by sNext
//     so that, at each step, every thread of the group touches a different
//     shared entry. Both the i-side and the j-side sums are produced.
//
// Output: cSim.pBornSum[atom + (partner tile index) * cSim.stride].
//
// Shared memory: static arrays sA, sWorkUnit and sNext.
// NOTE(review): apart from the initial __syncthreads() there is no barrier
// between a thread writing its sA entry and other threads of the group
// reading it; this presumably relies on GRID matching the warp size and on
// warp-synchronous execution - confirm for the target architecture.
__global__ void kCalculateObcGbsaBornSum_kernel()
{
    // Read queue of work blocks once so the remainder of
    // kernel can run asynchronously
    int pos = (blockIdx.x * cSim.workUnits) / gridDim.x;
    int end = ((blockIdx.x + 1) * cSim.workUnits) / gridDim.x;
    if (threadIdx.x < end - pos)
    {
        sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
    }
    if (threadIdx.x < GRID)
    {
        // sNext[k] is the entry visited after k: (k - 1) modulo GRID.
        sNext[threadIdx.x] = (threadIdx.x - 1) & (GRID - 1);
    }
    __syncthreads();

    // Now change pos and end to reflect work queue just read
    // into shared memory
    end = end - pos;
    pos = end - (threadIdx.x >> GRIDBITS) - 1;
    while (pos >= 0)
    {
        // Extract cell coordinates from appropriate work unit
        unsigned int x = sWorkUnit[pos];
        unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
        x = (x >> 17) << GRIDBITS;
        float dx;
        float dy;
        float dz;
        float r2;
        float r;
        unsigned int tgx = threadIdx.x & (GRID - 1);    // lane within the group
        unsigned int tbx = threadIdx.x - tgx;           // first thread of the group
        int tj = tgx;
        Atom* psA = &sA[tbx];                           // the group's GRID shared entries

        if (x == y) // Handle diagonals uniquely at 50% efficiency
        {
            // Read fixed atom data into registers and GRF
            unsigned int i = x + tgx;
            float4 apos = cSim.pPosq[i];    // Local atom x, y, z, sum
            float2 ar = cSim.pObcData[i];   // Local atom vr, sr
            sA[threadIdx.x].x = apos.x;
            sA[threadIdx.x].y = apos.y;
            sA[threadIdx.x].z = apos.z;
            sA[threadIdx.x].r = ar.x;
            sA[threadIdx.x].sr = ar.y;
            apos.w = 0.0f;                  // .w is reused as the running Born sum

            for (unsigned int j = 0; j < GRID; j++)
            {
                dx = psA[j].x - apos.x;
                dy = psA[j].y - apos.y;
                dz = psA[j].z - apos.z;
                r2 = dx * dx + dy * dy + dz * dz;
                r = sqrt(r2);
                // For j == tgx this is 1/0, but the value is only used inside
                // the guarded branch below, which excludes j == tgx.
                float rInverse = 1.0f / r;
                float rScaledRadiusJ = r + psA[j].sr;
                if ((j != tgx) && (ar.x < rScaledRadiusJ))
                {
                    float l_ij = 1.0f / max(ar.x, fabs(r - psA[j].sr));
                    float u_ij = 1.0f / rScaledRadiusJ;
                    float l_ij2 = l_ij * l_ij;
                    float u_ij2 = u_ij * u_ij;
                    float ratio = log(u_ij / l_ij);
                    apos.w += l_ij -
                              u_ij +
                              0.25f * r * (u_ij2 - l_ij2) +
                              (0.50f * rInverse * ratio) +
                              (0.25f * psA[j].sr * psA[j].sr * rInverse) *
                              (l_ij2 - u_ij2);
                    // Atom i lies entirely inside the *scaled* sphere of atom j.
                    // The comparison uses sr, the same radius as l_ij/u_ij above
                    // and as both tests in the off-diagonal branch.
                    if (ar.x < (psA[j].sr - r))
                    {
                        apos.w += 2.0f * ((1.0f / ar.x) - l_ij);
                    }
                }
            }

            // Write results
            int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
            cSim.pBornSum[offset] = apos.w;
        }
        else // 100% utilization
        {
            // Read fixed atom data into registers and GRF
            int j = y + tgx;
            unsigned int i = x + tgx;
            float4 temp = cSim.pPosq[j];
            float2 temp1 = cSim.pObcData[j];
            float4 apos = cSim.pPosq[i];    // Local atom x, y, z, sum
            float2 ar = cSim.pObcData[i];   // Local atom vr, sr
            sA[threadIdx.x].x = temp.x;
            sA[threadIdx.x].y = temp.y;
            sA[threadIdx.x].z = temp.z;
            sA[threadIdx.x].r = temp1.x;
            sA[threadIdx.x].sr = temp1.y;
            sA[threadIdx.x].sum = apos.w = 0.0f;

            for (unsigned int j = 0; j < GRID; j++)
            {
                dx = psA[tj].x - apos.x;
                dy = psA[tj].y - apos.y;
                dz = psA[tj].z - apos.z;
                r2 = dx * dx + dy * dy + dz * dz;
                r = sqrt(r2);
                float rInverse = 1.0f / r;

                // Contribution of staged atom tj to this thread's atom i.
                float rScaledRadiusJ = r + psA[tj].sr;
                if (ar.x < rScaledRadiusJ)
                {
                    float l_ij = 1.0f / max(ar.x, fabs(r - psA[tj].sr));
                    float u_ij = 1.0f / rScaledRadiusJ;
                    float l_ij2 = l_ij * l_ij;
                    float u_ij2 = u_ij * u_ij;
                    float ratio = log(u_ij / l_ij);
                    float term = l_ij -
                                 u_ij +
                                 0.25f * r * (u_ij2 - l_ij2) +
                                 (0.50f * rInverse * ratio) +
                                 (0.25f * psA[tj].sr * psA[tj].sr * rInverse) *
                                 (l_ij2 - u_ij2);
                    if (ar.x < (psA[tj].sr - r))
                    {
                        term += 2.0f * ((1.0f / ar.x) - l_ij);
                    }
                    apos.w += term;
                }

                // Contribution of this thread's atom i to staged atom tj.
                float rScaledRadiusI = r + ar.y;
                if (psA[tj].r < rScaledRadiusI)
                {
                    float l_ij = 1.0f / max(psA[tj].r, fabs(r - ar.y));
                    float u_ij = 1.0f / rScaledRadiusI;
                    float l_ij2 = l_ij * l_ij;
                    float u_ij2 = u_ij * u_ij;
                    float ratio = log(u_ij / l_ij);
                    float term = l_ij -
                                 u_ij +
                                 0.25f * r * (u_ij2 - l_ij2) +
                                 (0.50f * rInverse * ratio) +
                                 (0.25f * ar.y * ar.y * rInverse) *
                                 (l_ij2 - u_ij2);
                    if (psA[tj].r < (ar.y - r))
                    {
                        term += 2.0f * ((1.0f / psA[tj].r) - l_ij);
                    }
                    psA[tj].sum += term;
                }

                // Rotate to the next staged atom.
                tj = sNext[tj];
            }

            // Write results
            int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
            cSim.pBornSum[offset] = apos.w;
            offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
            cSim.pBornSum[offset] = sA[threadIdx.x].sum;
        }
        pos -= cSim.nonbond_workBlock;
    }
}
// Host-side launcher for kCalculateObcGbsaBornSum_kernel.
// Launch shape: gpu->sim.nonbond_blocks blocks of
// gpu->sim.nonbond_threads_per_block threads; the kernel uses only statically
// declared shared memory. The LAUNCHERROR tag is this function's own name,
// matching the convention of the other launchers in this file.
void kCalculateObcGbsaBornSum(gpuContext gpu)
{
    // printf("kCalculateObcgbsaBornSum\n");
    kCalculateObcGbsaBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
    LAUNCHERROR("kCalculateObcGbsaBornSum");
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using namespace std;
#include "gputypes.h"
// Per-thread shared-memory record for the OBC GBSA force kernels in this
// file. Nine consecutive floats; the member order is part of the layout.
// NOTE(review): the kernel that fills these members lies outside the visible
// part of the file, so the per-member notes are inferred from the names -
// confirm against that kernel.
struct Atom {
    float x, y, z;     // presumably the atom position
    float q;           // presumably the atom charge
    float br;          // presumably the Born radius
    float fx, fy, fz;  // presumably the accumulated force components
    float fb;          // presumably the accumulated Born force
};
// Statically sized shared-memory arrays for the nonbonded kernel of this file.
// NOTE(review): the kernel that uses them lies outside the visible part of the
// file; presumably they play the same roles as in the Born-sum kernel (one
// Atom per thread, the block's work-unit slice, and a GRID-entry rotation
// table) - confirm.
__shared__ Atom sA[G8X_NONBOND_THREADS_PER_BLOCK];
__shared__ unsigned int sWorkUnit[G8X_NONBOND_WORKUNITS_PER_SM];
__shared__ unsigned int sNext[GRID];
// File-local constant-memory copy of the simulation descriptor; written by
// SetCalculateObcGbsaForces1Sim().
static __constant__ cudaGmxSimulation cSim;
// Uploads the host-side simulation descriptor (gpu->sim) into this file's
// constant-memory copy, cSim. The copy status is handed to RTERROR together
// with a fixed diagnostic message.
void SetCalculateObcGbsaForces1Sim(gpuContext gpu)
{
    cudaError_t err = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(err, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Downloads this file's constant-memory simulation descriptor (cSim) back
// into gpu->sim. The copy status is handed to RTERROR; the message names the
// Get direction so a failure here is not mistaken for one in the Set routine.
void GetCalculateObcGbsaForces1Sim(gpuContext gpu)
{
    cudaError_t status;
    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
}
// Per-atom reduction of the Born-force buffers, including a surface-area term.
//
// For every atom index below cSim.atoms (threads advance by the total number
// of launched threads) the kernel
//   1. adds up cSim.nonbondOutputBuffers partial values stored cSim.stride
//      elements apart in cSim.pBornForce (consumed four, then two, then one
//      at a time);
//   2. adds surfaceAreaFactor * r^2 * ((obcData.x + dielectricOffset) /
//      bornRadius)^6 / bornRadius, with r = obcData.x + dielectricOffset +
//      probeRadius;
//   3. multiplies by bornRadius^2 * obcChain and stores the result in the
//      atom's first pBornForce slot.
// There is no guard against bornRadius == 0.
__global__ void kReduceObcGbsaBornForces_kernel()
{
    const unsigned int totalThreads = gridDim.x * blockDim.x;

    for (unsigned int atomIdx = blockIdx.x * blockDim.x + threadIdx.x;
         atomIdx < cSim.atoms;
         atomIdx += totalThreads)
    {
        float  bornRadius = cSim.pBornRadii[atomIdx];
        float  obcChain   = cSim.pObcChain[atomIdx];
        float2 obcData    = cSim.pObcData[atomIdx];

        // Sum the per-buffer contributions.
        float  totalForce = 0.0f;
        float* pBuffer    = cSim.pBornForce + atomIdx;
        int    remaining  = cSim.nonbondOutputBuffers;
        for (; remaining >= 4; remaining -= 4)
        {
            float f1 = *pBuffer;
            pBuffer += cSim.stride;
            float f2 = *pBuffer;
            pBuffer += cSim.stride;
            float f3 = *pBuffer;
            pBuffer += cSim.stride;
            float f4 = *pBuffer;
            pBuffer += cSim.stride;
            totalForce += f1 + f2 + f3 + f4;
        }
        if (remaining >= 2)
        {
            float f1 = *pBuffer;
            pBuffer += cSim.stride;
            float f2 = *pBuffer;
            pBuffer += cSim.stride;
            totalForce += f1 + f2;
            remaining -= 2;
        }
        if (remaining > 0)
        {
            totalForce += *pBuffer;
        }

        // Surface-area contribution.
        float r      = (obcData.x + cSim.dielectricOffset + cSim.probeRadius);
        float ratio6 = pow((obcData.x + cSim.dielectricOffset) / bornRadius, 6.0f);
        float saTerm = cSim.surfaceAreaFactor * r * r * ratio6;
        totalForce += saTerm / bornRadius;

        // Chain-rule scaling, then write back to the first buffer.
        totalForce *= bornRadius * bornRadius * obcChain;
        cSim.pBornForce[atomIdx] = totalForce;
    }
}
// Per-atom reduction of the Born-force buffers without a surface-area term.
//
// Same buffer reduction as kReduceObcGbsaBornForces_kernel (four, then two,
// then one value at a time, cSim.stride elements apart), but the only
// post-processing is the multiplication by bornRadius^2 * obcChain. The
// result overwrites the atom's first pBornForce slot.
__global__ void kReduceObcGbsaBornForces1_kernel()
{
    const unsigned int totalThreads = gridDim.x * blockDim.x;

    for (unsigned int atomIdx = blockIdx.x * blockDim.x + threadIdx.x;
         atomIdx < cSim.atoms;
         atomIdx += totalThreads)
    {
        float bornRadius = cSim.pBornRadii[atomIdx];
        float obcChain   = cSim.pObcChain[atomIdx];

        // Sum the per-buffer contributions.
        float  totalForce = 0.0f;
        float* pBuffer    = cSim.pBornForce + atomIdx;
        int    remaining  = cSim.nonbondOutputBuffers;
        for (; remaining >= 4; remaining -= 4)
        {
            float f1 = *pBuffer;
            pBuffer += cSim.stride;
            float f2 = *pBuffer;
            pBuffer += cSim.stride;
            float f3 = *pBuffer;
            pBuffer += cSim.stride;
            float f4 = *pBuffer;
            pBuffer += cSim.stride;
            totalForce += f1 + f2 + f3 + f4;
        }
        if (remaining >= 2)
        {
            float f1 = *pBuffer;
            pBuffer += cSim.stride;
            float f2 = *pBuffer;
            pBuffer += cSim.stride;
            totalForce += f1 + f2;
            remaining -= 2;
        }
        if (remaining > 0)
        {
            totalForce += *pBuffer;
        }

        // Chain-rule scaling, then write back to the first buffer.
        totalForce *= bornRadius * bornRadius * obcChain;
        cSim.pBornForce[atomIdx] = totalForce;
    }
}
// Adds a surface-area term to the value already stored in cSim.pBornForce for
// every atom and rescales the result:
//
//     r       = obcData.x + dielectricOffset + probeRadius
//     ratio6  = ((obcData.x + dielectricOffset) / bornRadius)^6
//     force   = (force + surfaceAreaFactor * r * r * ratio6 / bornRadius)
//               * bornRadius^2 * obcChain
//
// Each thread starts at its global thread index and advances by the total
// number of launched threads until it passes cSim.atoms.
// NOTE(review): the kernel name suggests this is the ACE surface-area
// approximation -- confirm against the reference implementation.
__global__ void kAceGbsa_kernel()
{
    for (unsigned int atom = (blockIdx.x * blockDim.x + threadIdx.x);
         atom < cSim.atoms;
         atom += gridDim.x * blockDim.x)
    {
        float radius   = cSim.pBornRadii[atom];
        float chain    = cSim.pObcChain[atom];
        float2 params  = cSim.pObcData[atom];
        float force    = cSim.pBornForce[atom];

        float probed   = (params.x + cSim.dielectricOffset + cSim.probeRadius);
        float scaled6  = pow((params.x + cSim.dielectricOffset) / radius, 6.0f);
        float surface  = cSim.surfaceAreaFactor * probed * probed * scaled6;

        force += surface / radius;
        force *= radius * radius * chain;
        cSim.pBornForce[atom] = force;
    }
}
// Host-side launcher for kReduceObcGbsaBornForces_kernel.
//
// gpu: context whose sim.blocks / sim.bf_reduce_threads_per_block fields give
//      the launch configuration.
//
// kReduceObcGbsaBornForces1_kernel and kAceGbsa_kernel in this file take the
// same launch configuration but are not launched here.
void kReduceObcGbsaBornForces(gpuContext gpu)
{
    kReduceObcGbsaBornForces_kernel<<<gpu->sim.blocks, gpu->sim.bf_reduce_threads_per_block>>>();
    // Report launch failures the same way the other launchers in this file do;
    // previously a bad launch configuration went unnoticed here.
    LAUNCHERROR("kReduceObcGbsaBornForces");
}
// For every work unit (tile) assigned to this block, accumulates the pairwise
// force (af.x/y/z) and the Born-force term (af.w / fb) between the GRID atoms
// starting at index x and the GRID atoms starting at index y, then writes the
// results to cSim.pForce4a and cSim.pBornForce.
//
// Structure visible in this kernel:
//   * Each run of GRID consecutive threads (threadIdx.x >> GRIDBITS) processes
//     one work unit at a time; tgx is the thread's index within that run.
//   * A work unit is a packed unsigned int: bits 17 and up give the x tile
//     index, bits 2..16 give the y tile index; the two low bits are not read here.
//   * Output index = atom index + (partner tile index) * cSim.stride.
//
// NOTE(review): sA entries written by one thread are read (and, off the
// diagonal, updated) by other threads of the same GRID-wide run without a
// barrier in between; this presumably relies on those threads executing in
// lockstep within a warp -- confirm before reusing on other hardware.
__global__ void kCalculateObcGbsaForces1_kernel()
{
// Read queue of work blocks once so the remainder of
// kernel can run asynchronously
// [pos, end) is this block's slice of cSim.pWorkUnit; the first
// nbWorkUnitsPerBlockRemainder blocks each receive one extra work unit.
int pos = cSim.nbWorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.nbWorkUnitsPerBlockRemainder);
int end = cSim.nbWorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.nbWorkUnitsPerBlockRemainder);
// NOTE(review): assumes the slice fits in both blockDim.x and sWorkUnit -- not checked here.
if (threadIdx.x < end - pos)
{
sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
}
// Build the ring successor table: sNext[t] = (t + 1) mod GRID (GRID is used as a power-of-two mask).
if (threadIdx.x < GRID)
{
sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
}
__syncthreads();
// Now change pos and end to reflect work queue just read
// into shared memory
end = end - pos;
// Each GRID-wide run of threads starts from a different work unit, counting
// back from the end of the queue; runs beyond the queue get pos < 0 and idle.
pos = end - (threadIdx.x >> GRIDBITS) - 1;
while (pos >= 0)
{
// Extract cell coordinates from appropriate work unit
unsigned int x = sWorkUnit[pos];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
x = (x >> 17) << GRIDBITS;
float4 apos; // Local atom x, y, z, q
float4 af; // Local atom fx, fy, fz, fb
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int tbx = threadIdx.x - tgx;
int tj = tgx;
// psA points at the first sA entry of this thread's GRID-wide run.
Atom* psA = &sA[tbx];
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
unsigned int i = x + tgx;
apos = cSim.pPosq[i];
float br = cSim.pBornRadii[i];
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
sA[threadIdx.x].q = apos.w;
sA[threadIdx.x].br = br;
af.x = 0.0f;
af.y = 0.0f;
af.z = 0.0f;
af.w = 0.0f;
// Fold cSim.preFactor into this atom's q once, outside the loop.
apos.w *= cSim.preFactor;
// Every thread visits all GRID atoms of the tile (its own included) and
// accumulates only into its own registers.
for (unsigned int j = 0; j < GRID; j++)
{
float dx = psA[j].x - apos.x;
float dy = psA[j].y - apos.y;
float dz = psA[j].z - apos.z;
float r2 = dx * dx + dy * dy + dz * dz;
// alpha2_ij = br_i * br_j, D_ij = r^2 / (4 * alpha2_ij),
// denominator = sqrt(r^2 + alpha2_ij * exp(-D_ij)), Gpol = q_i' * q_j / denominator^3.
float alpha2_ij = br * psA[j].br;
float D_ij = r2 / (4.0f * alpha2_ij);
float expTerm = exp(-D_ij);
float denominator2 = r2 + alpha2_ij * expTerm;
float denominator = sqrt(denominator2);
float Gpol = (apos.w * psA[j].q) / (denominator * denominator2);
float dGpol_dr = Gpol * (1.0f - 0.25f * expTerm);
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
dx *= dGpol_dr;
dy *= dGpol_dr;
dz *= dGpol_dr;
af.x -= dx;
af.y -= dy;
af.z -= dz;
af.w += dGpol_dalpha2_ij * psA[j].br;
}
// Write results
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = af;
cSim.pBornForce[offset] = af.w;
}
else // 100% utilization
{
// Read fixed atom data into registers and GRF
// Atom j (from the y tile) goes to shared memory, atom i (from the x tile) stays in registers.
int j = y + tgx;
unsigned int i = x + tgx;
float4 temp = cSim.pPosq[j];
float temp1 = cSim.pBornRadii[j];
apos = cSim.pPosq[i];
float br = cSim.pBornRadii[i];
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].q = temp.w;
sA[threadIdx.x].br = temp1;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
sA[threadIdx.x].fz = af.z = 0.0f;
sA[threadIdx.x].fb = af.w = 0.0f;
apos.w *= cSim.preFactor;
// tj starts at tgx and follows sNext, so within one iteration the threads of
// a run address distinct sA entries while each thread still visits all GRID of them.
for (j = 0; j < GRID; j++)
{
float dx = psA[tj].x - apos.x;
float dy = psA[tj].y - apos.y;
float dz = psA[tj].z - apos.z;
float r2 = dx * dx + dy * dy + dz * dz;
float alpha2_ij = br * psA[tj].br;
float D_ij = r2 / (4.0f * alpha2_ij);
float expTerm = exp(-D_ij);
float denominator2 = r2 + alpha2_ij * expTerm;
float denominator = sqrt(denominator2);
float Gpol = (apos.w * psA[tj].q) / (denominator * denominator2);
float dGpol_dr = Gpol * (1.0f - 0.25f * expTerm);
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
dx *= dGpol_dr;
dy *= dGpol_dr;
dz *= dGpol_dr;
// Equal and opposite contributions: registers for atom i, shared memory for atom tj.
af.x -= dx;
af.y -= dy;
af.z -= dz;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
af.w += dGpol_dalpha2_ij * psA[tj].br;
psA[tj].fb += dGpol_dalpha2_ij * br;
tj = sNext[tj];
}
// Write results
// First the x-tile atom (held in registers), indexed by the y tile ...
int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = af;
cSim.pBornForce[offset] = af.w;
// ... then the y-tile atom (accumulated in shared memory), indexed by the x tile.
// The offset is computed here and again, identically, a few lines below.
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
af.x = sA[threadIdx.x].fx;
af.y = sA[threadIdx.x].fy;
af.z = sA[threadIdx.x].fz;
af.w = sA[threadIdx.x].fb;
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = af;
cSim.pBornForce[offset] = af.w;
}
// Step to this run's next work unit.
// NOTE(review): presumably nonbond_workBlock is the number of GRID-wide runs per block -- confirm.
pos -= cSim.nonbond_workBlock;
}
}
// Kernel variant selected below when gpu->sm_version is not less than SM_12;
// it is not defined in this translation unit.
__global__ extern void kCalculateObcGbsaForces1_12_kernel();

// Host-side launcher for the first OBC/GBSA force loop.
//
// gpu: context supplying sm_version (selects the kernel variant) and the
//      sim.nonbond_blocks / sim.nonbond_threads_per_block launch configuration.
//
// Launches kCalculateObcGbsaForces1_kernel when gpu->sm_version < SM_12 and
// kCalculateObcGbsaForces1_12_kernel otherwise, then reports launch errors.
void kCalculateObcGbsaForces1(gpuContext gpu)
{
    if (gpu->sm_version < SM_12)
    {
        kCalculateObcGbsaForces1_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
    }
    else
    {
        kCalculateObcGbsaForces1_12_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
    }
    // The label now matches the function name (it used to read "...Force1"),
    // so an error report can be traced to this launcher.
    LAUNCHERROR("kCalculateObcGbsaForces1");
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using namespace std;
#include "gputypes.h"
// Per-thread shared-memory record used by the force kernel in this file.
// Member order is unchanged, so the memory layout is the same as before.
struct Atom {
    float x, y, z;     // x/y/z components copied from a cSim.pPosq entry
    float q;           // w component of the same cSim.pPosq entry
    float br;          // value copied from cSim.pBornRadii
    float fx, fy, fz;  // force accumulators
    float fb;          // Born-force accumulator
};
// Shared-memory scratch records; the kernel below indexes this by threadIdx.x.
__shared__ Atom sA[GT2XX_NONBOND_THREADS_PER_BLOCK];
// This block's slice of cSim.pWorkUnit, copied in at the start of the kernel.
__shared__ unsigned int sWorkUnit[GT2XX_NONBOND_WORKUNITS_PER_SM];
// Ring successor table; the kernel fills it with sNext[t] = (t + 1) & (GRID - 1).
__shared__ unsigned int sNext[GRID];
// File-local constant-memory copy of the simulation parameters, filled from
// gpu->sim by SetCalculateObcGbsaForces1_12Sim.
static __constant__ cudaGmxSimulation cSim;
// Copies the host-side simulation parameters (gpu->sim) into this file's
// constant-memory symbol cSim. The copy status is handed to RTERROR together
// with a description of the failed operation.
void SetCalculateObcGbsaForces1_12Sim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Copies this file's constant-memory symbol cSim back into the host-side
// simulation parameters (gpu->sim). The copy status is handed to RTERROR
// together with a description of the failed operation.
void GetCalculateObcGbsaForces1_12Sim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    // The message used to say "SetSim" (copied from the setter); it now names
    // the operation that actually failed.
    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
}
// For every work unit (tile) assigned to this block, accumulates the pairwise
// force (af.x/y/z) and the Born-force term (af.w / fb) between the GRID atoms
// starting at index x and the GRID atoms starting at index y, then writes the
// results to cSim.pForce4a and cSim.pBornForce.
//
// Structure visible in this kernel:
//   * Each run of GRID consecutive threads (threadIdx.x >> GRIDBITS) processes
//     one work unit at a time; tgx is the thread's index within that run.
//   * A work unit is a packed unsigned int: bits 17 and up give the x tile
//     index, bits 2..16 give the y tile index; the two low bits are not read here.
//   * Output index = atom index + (partner tile index) * cSim.stride.
//
// NOTE(review): sA entries written by one thread are read (and, off the
// diagonal, updated) by other threads of the same GRID-wide run without a
// barrier in between; this presumably relies on those threads executing in
// lockstep within a warp -- confirm before reusing on other hardware.
__global__ void kCalculateObcGbsaForces1_12_kernel()
{
// Read queue of work blocks once so the remainder of
// kernel can run asynchronously
// [pos, end) is this block's slice of cSim.pWorkUnit; the first
// nbWorkUnitsPerBlockRemainder blocks each receive one extra work unit.
int pos = cSim.nbWorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.nbWorkUnitsPerBlockRemainder);
int end = cSim.nbWorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.nbWorkUnitsPerBlockRemainder);
// NOTE(review): assumes the slice fits in both blockDim.x and sWorkUnit -- not checked here.
if (threadIdx.x < end - pos)
{
sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
}
// Build the ring successor table: sNext[t] = (t + 1) mod GRID (GRID is used as a power-of-two mask).
if (threadIdx.x < GRID)
{
sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
}
__syncthreads();
// Now change pos and end to reflect work queue just read
// into shared memory
end = end - pos;
// Each GRID-wide run of threads starts from a different work unit, counting
// back from the end of the queue; runs beyond the queue get pos < 0 and idle.
pos = end - (threadIdx.x >> GRIDBITS) - 1;
while (pos >= 0)
{
// Extract cell coordinates from appropriate work unit
unsigned int x = sWorkUnit[pos];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
x = (x >> 17) << GRIDBITS;
float4 apos; // Local atom x, y, z, q
float4 af; // Local atom fx, fy, fz, fb
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int tbx = threadIdx.x - tgx;
int tj = tgx;
// psA points at the first sA entry of this thread's GRID-wide run.
Atom* psA = &sA[tbx];
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
unsigned int i = x + tgx;
apos = cSim.pPosq[i];
float br = cSim.pBornRadii[i];
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
sA[threadIdx.x].q = apos.w;
sA[threadIdx.x].br = br;
af.x = 0.0f;
af.y = 0.0f;
af.z = 0.0f;
af.w = 0.0f;
// Fold cSim.preFactor into this atom's q once, outside the loop.
apos.w *= cSim.preFactor;
// Every thread visits all GRID atoms of the tile (its own included) and
// accumulates only into its own registers.
for (unsigned int j = 0; j < GRID; j++)
{
float dx = psA[j].x - apos.x;
float dy = psA[j].y - apos.y;
float dz = psA[j].z - apos.z;
float r2 = dx * dx + dy * dy + dz * dz;
// alpha2_ij = br_i * br_j, D_ij = r^2 / (4 * alpha2_ij),
// denominator = sqrt(r^2 + alpha2_ij * exp(-D_ij)), Gpol = q_i' * q_j / denominator^3.
float alpha2_ij = br * psA[j].br;
float D_ij = r2 / (4.0f * alpha2_ij);
float expTerm = exp(-D_ij);
float denominator2 = r2 + alpha2_ij * expTerm;
float denominator = sqrt(denominator2);
float Gpol = (apos.w * psA[j].q) / (denominator * denominator2);
float dGpol_dr = Gpol * (1.0f - 0.25f * expTerm);
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
dx *= dGpol_dr;
dy *= dGpol_dr;
dz *= dGpol_dr;
af.x -= dx;
af.y -= dy;
af.z -= dz;
af.w += dGpol_dalpha2_ij * psA[j].br;
}
// Write results
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = af;
cSim.pBornForce[offset] = af.w;
}
else // 100% utilization
{
// Read fixed atom data into registers and GRF
// Atom j (from the y tile) goes to shared memory, atom i (from the x tile) stays in registers.
int j = y + tgx;
unsigned int i = x + tgx;
float4 temp = cSim.pPosq[j];
float temp1 = cSim.pBornRadii[j];
apos = cSim.pPosq[i];
float br = cSim.pBornRadii[i];
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].q = temp.w;
sA[threadIdx.x].br = temp1;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
sA[threadIdx.x].fz = af.z = 0.0f;
sA[threadIdx.x].fb = af.w = 0.0f;
apos.w *= cSim.preFactor;
// tj starts at tgx and follows sNext, so within one iteration the threads of
// a run address distinct sA entries while each thread still visits all GRID of them.
for (j = 0; j < GRID; j++)
{
float dx = psA[tj].x - apos.x;
float dy = psA[tj].y - apos.y;
float dz = psA[tj].z - apos.z;
float r2 = dx * dx + dy * dy + dz * dz;
float alpha2_ij = br * psA[tj].br;
float D_ij = r2 / (4.0f * alpha2_ij);
float expTerm = exp(-D_ij);
float denominator2 = r2 + alpha2_ij * expTerm;
float denominator = sqrt(denominator2);
float Gpol = (apos.w * psA[tj].q) / (denominator * denominator2);
float dGpol_dr = Gpol * (1.0f - 0.25f * expTerm);
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
dx *= dGpol_dr;
dy *= dGpol_dr;
dz *= dGpol_dr;
// Equal and opposite contributions: registers for atom i, shared memory for atom tj.
af.x -= dx;
af.y -= dy;
af.z -= dz;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
af.w += dGpol_dalpha2_ij * psA[tj].br;
psA[tj].fb += dGpol_dalpha2_ij * br;
tj = sNext[tj];
}
// Write results
// First the x-tile atom (held in registers), indexed by the y tile ...
int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = af;
cSim.pBornForce[offset] = af.w;
// ... then the y-tile atom (accumulated in shared memory), indexed by the x tile.
// The offset is computed here and again, identically, a few lines below.
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
af.x = sA[threadIdx.x].fx;
af.y = sA[threadIdx.x].fy;
af.z = sA[threadIdx.x].fz;
af.w = sA[threadIdx.x].fb;
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4a[offset] = af;
cSim.pBornForce[offset] = af.w;
}
// Step to this run's next work unit.
// NOTE(review): presumably nonbond_workBlock is the number of GRID-wide runs per block -- confirm.
pos -= cSim.nonbond_workBlock;
}
}
// Host-side launcher for kCalculateObcGbsaForces1_12_kernel.
//
// gpu: context whose sim.nonbond_blocks / sim.nonbond_threads_per_block fields
//      give the launch configuration.
void kCalculateObcGbsaForces1_12(gpuContext gpu)
{
    kCalculateObcGbsaForces1_12_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block>>>();
    // The label now matches the function name (it used to read "...Force1_12"),
    // so an error report can be traced to this launcher.
    LAUNCHERROR("kCalculateObcGbsaForces1_12");
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using namespace std;
#include "gputypes.h"
#include "cudaKernels.h"
// Per-thread shared-memory record used by the force kernel in this file.
// Member order is unchanged, so the memory layout is the same as before.
struct Atom {
    float x, y, z;     // x/y/z components copied from a cSim.pPosq entry
    float r;           // x component of the atom's cSim.pObcData entry
    float sr;          // y component of the atom's cSim.pObcData entry
    float sr2;         // sr * sr
    float fx, fy, fz;  // force accumulators
    float fb;          // value copied from cSim.pBornForce
    int pos;           // index of the work unit the thread is processing
    int wx, wy;        // first atom index of the work unit's x and y tiles
};
// Shared-memory scratch records; the kernel below indexes this by threadIdx.x.
__shared__ Atom sA[G8X_BORNFORCE2_THREADS_PER_BLOCK];
// This block's slice of cSim.pWorkUnit, copied in at the start of the kernel.
__shared__ unsigned int sWorkUnit[G8X_NONBOND_WORKUNITS_PER_SM];
// Ring successor table; the kernel fills it with sNext[t] = (t + 1) & (GRID - 1).
__shared__ unsigned int sNext[GRID];
// File-local constant-memory copy of the simulation parameters, filled from
// gpu->sim by SetCalculateObcGbsaForces2Sim.
static __constant__ cudaGmxSimulation cSim;
// Copies the host-side simulation parameters (gpu->sim) into this file's
// constant-memory symbol cSim. The copy status is handed to RTERROR together
// with a description of the failed operation.
void SetCalculateObcGbsaForces2Sim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Copies this file's constant-memory symbol cSim back into the host-side
// simulation parameters (gpu->sim). The copy status is handed to RTERROR
// together with a description of the failed operation.
void GetCalculateObcGbsaForces2Sim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    // The message used to say "SetSim" (copied from the setter); it now names
    // the operation that actually failed.
    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
}
// For every work unit (tile) assigned to this block, converts the per-atom
// values read from cSim.pBornForce into pairwise x/y/z force contributions
// between the GRID atoms starting at index x and the GRID atoms starting at
// index y, and writes them to cSim.pForce4b (w component set to 0).
//
// Structure visible in this kernel:
//   * Each run of GRID consecutive threads (threadIdx.x >> GRIDBITS) processes
//     one work unit at a time; tgx is the thread's index within that run.
//   * A work unit is a packed unsigned int: bits 17 and up give the x tile
//     index, bits 2..16 give the y tile index; the two low bits are not read here.
//   * The work-unit cursor (pos) and the tile origins (wx, wy) are kept in the
//     thread's own sA entry.
//     NOTE(review): presumably done to save registers -- confirm.
//   * Output index = atom index + (partner tile index) * cSim.stride.
//
// NOTE(review): sA entries written by one thread are read and updated by other
// threads of the same GRID-wide run without a barrier in between; this
// presumably relies on those threads executing in lockstep within a warp --
// confirm before reusing on other hardware.
__global__ void kCalculateObcGbsaForces2_kernel()
{
// Read queue of work blocks once so the remainder of
// kernel can run asynchronously
// [pos, end) is this block's slice of cSim.pWorkUnit; the first
// bf2WorkUnitsPerBlockRemainder blocks each receive one extra work unit.
int pos = cSim.bf2WorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.bf2WorkUnitsPerBlockRemainder);
int end = cSim.bf2WorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.bf2WorkUnitsPerBlockRemainder);
// NOTE(review): assumes the slice fits in both blockDim.x and sWorkUnit -- not checked here.
if (threadIdx.x < end - pos)
{
sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
}
// Build the ring successor table: sNext[t] = (t + 1) mod GRID (GRID is used as a power-of-two mask).
if (threadIdx.x < GRID)
{
sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
}
__syncthreads();
// Now change pos and end to reflect work queue just read
// into shared memory
end = end - pos;
// Each GRID-wide run of threads starts from a different work unit, counting
// back from the end of the queue; runs beyond the queue get a negative cursor and idle.
sA[threadIdx.x].pos = end - (threadIdx.x >> GRIDBITS) - 1;
while (sA[threadIdx.x].pos >= 0)
{
// Extract cell coordinates from appropriate work unit
unsigned int x = sWorkUnit[sA[threadIdx.x].pos];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
x = (x >> 17) << GRIDBITS;
unsigned int tgx = threadIdx.x & (GRID - 1);
// Atom i (from the x tile) is held in registers: position, pObcData pair (a.x, a.y) and fb.
unsigned int i = x + tgx;
float4 apos = cSim.pPosq[i];
float2 a = cSim.pObcData[i];
float fb = cSim.pBornForce[i];
unsigned int tbx = threadIdx.x - tgx;
int tj = tgx;
// psA points at the first sA entry of this thread's GRID-wide run.
Atom* psA = &sA[tbx];
sA[threadIdx.x].wx = x;
sA[threadIdx.x].wy = y;
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
float3 af;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
sA[threadIdx.x].fz = af.z = 0.0f;
// float sum = 0.0f;
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
// float oneOverR = 1.0f / a.x;
sA[threadIdx.x].r = a.x;
sA[threadIdx.x].sr = a.y;
sA[threadIdx.x].sr2 = a.y * a.y;
sA[threadIdx.x].fb = fb;
// Walk the ring starting at the successor of tgx and stop on returning to
// tgx, so the thread's own atom (r == 0) is never visited.
for (unsigned int j = sNext[tgx]; j != tgx; j = sNext[j])
{
float dx = psA[j].x - apos.x;
float dy = psA[j].y - apos.y;
float dz = psA[j].z - apos.z;
float r2 = dx * dx + dy * dy + dz * dz;
float r = sqrt(r2);
// Atom I Born forces and sum
float rScaledRadiusJ = r + psA[j].sr;
float l_ij = 1.0f / max(a.x, fabs(r - psA[j].sr));
float u_ij = 1.0f / rScaledRadiusJ;
float rInverse = 1.0f / r;
float l_ij2 = l_ij * l_ij;
float u_ij2 = u_ij * u_ij;
float r2Inverse = rInverse * rInverse;
float t1 = log (u_ij / l_ij);
float t2 = (l_ij2 - u_ij2);
float t3 = t2 * rInverse;
t1 *= rInverse;
// Born Forces term
float term = 0.125f *
(1.000f + psA[j].sr2 * r2Inverse) * t3 +
0.250f * t1 * r2Inverse;
float dE = fb * term;
// Born sum term
// term = l_ij - u_ij +
// -0.25f * r * t2 +
// 0.50f * t1 +
// (0.25f * psA[j].sr2) * t3;
// if (a.x < (psA[j].sr - r))
// {
// term += 2.0f * (oneOverR - l_ij);
// }
// No contribution when a.x is at least r + sr_j.
if (a.x >= rScaledRadiusJ)
{
dE = /*term =*/ 0.0f;
}
// Equal and opposite contributions: registers for atom i, shared memory for atom j.
float d = dx * dE;
af.x -= d;
psA[j].fx += d;
d = dy * dE;
af.y -= d;
psA[j].fy += d;
d = dz * dE;
af.z -= d;
psA[j].fz += d;
// sum += term;
}
// Write results
// Combine the register total with what the other threads of the run added
// to this thread's sA entry.
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
float4 of;
of.x = af.x + sA[threadIdx.x].fx;
of.y = af.y + sA[threadIdx.x].fy;
of.z = af.z + sA[threadIdx.x].fz;
of.w = 0.0f;
cSim.pForce4b[offset] = of;
// cSim.pBornSum[offset] = sum;
}
else
{
// Read fixed atom data into registers and GRF
// Atom j (from the y tile) goes to shared memory.
int j = y + tgx;
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pObcData[j];
sA[threadIdx.x].fb = cSim.pBornForce[j];
float3 af;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
sA[threadIdx.x].fz = af.z = 0.0f;
// sA[threadIdx.x].sum = 0.0f;
// float sum = 0.0f;
float sr2 = a.y * a.y;
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].r = temp1.x;
sA[threadIdx.x].sr = temp1.y;
sA[threadIdx.x].sr2 = temp1.y * temp1.y;
// sA[threadIdx.x].oneOverR = 1.0f / temp1.x;
// tj starts at tgx and follows sNext, so within one iteration the threads of
// a run address distinct sA entries while each thread still visits all GRID of them.
for (j = 0; j < GRID; j++)
{
float dx = psA[tj].x - apos.x;
float dy = psA[tj].y - apos.y;
float dz = psA[tj].z - apos.z;
float r2 = dx * dx + dy * dy + dz * dz;
float r = sqrt(r2);
// Atom I Born Forces and sum
float r2Inverse = 1.0f / r2;
float rScaledRadiusJ = r + psA[tj].sr;
float rInverse = 1.0f / r;
float l_ij = 1.0f / max(a.x, fabs(r - psA[tj].sr));
float u_ij = 1.0f / rScaledRadiusJ;
float l_ij2 = l_ij * l_ij;
float u_ij2 = u_ij * u_ij;
float t1 = log (u_ij / l_ij);
float t2 = (l_ij2 - u_ij2);
float t3 = t2 * rInverse;
t1 *= rInverse;
// Born Forces term
float term = 0.125f *
(1.000f + psA[tj].sr2 * r2Inverse) * t3 +
0.250f * t1 * r2Inverse;
float dE = fb * term;
// Born sum term
// term = l_ij - u_ij +
// -0.25f * r * t2 +
// 0.50f * t1 +
// (0.25f * psA[tj].sr2) * t3;
// if (a.x < (psA[tj].sr - r))
// {
// term += 2.0f * ((1.0f / a.x) - l_ij);
// }
// No contribution when a.x is at least r + sr_j.
if (a.x >= rScaledRadiusJ)
{
dE = /*term =*/ 0.0f;
}
float d = dx * dE;
af.x -= d;
psA[tj].fx += d;
d = dy * dE;
af.y -= d;
psA[tj].fy += d;
d = dz * dE;
af.z -= d;
psA[tj].fz += d;
// sum += term;
// Atom J Born Forces and sum
// Same expression with the roles swapped: atom j's r and fb, atom i's sr (a.y).
float rScaledRadiusI = r + a.y;
l_ij = 1.0f / max(psA[tj].r, fabs(r - a.y));
u_ij = 1.0f / rScaledRadiusI;
l_ij2 = l_ij * l_ij;
u_ij2 = u_ij * u_ij;
t1 = log (u_ij / l_ij);
t2 = (l_ij2 - u_ij2);
t3 = t2 * rInverse;
t1 *= rInverse;
// Born Forces term
term = 0.125f *
(1.000f + sr2 * r2Inverse) * t3 +
0.250f * t1 * r2Inverse;
dE = psA[tj].fb * term;
// Born sum term
// term = l_ij - u_ij +
// -0.25f * r * t2 +
// 0.50f * t1 +
// (0.25f * sr2) * t3;
//
// if (psA[tj].r < (a.y - r))
// {
// term += 2.0f * (psA[tj].oneOverR - l_ij);
// }
// No contribution when atom j's r is at least r + sr_i.
if (psA[tj].r >= rScaledRadiusI)
{
dE = /*term =*/ 0.0f;
}
dx *= dE;
dy *= dE;
dz *= dE;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
af.x -= dx;
af.y -= dy;
af.z -= dz;
// psA[tj].sum += term;
tj = sNext[tj];
}
// Write results
// First the x-tile atom (register totals), indexed by the y tile; the tile
// origins are re-read from the thread's sA entry.
int offset = sA[threadIdx.x].wx + tgx + (sA[threadIdx.x].wy >> GRIDBITS) * cSim.stride;
float4 of;
of.x = af.x;
of.y = af.y;
of.z = af.z;
of.w = 0.0f;
cSim.pForce4b[offset] = of;
// cSim.pBornSum[offset] = sum;
// Then the y-tile atom (shared-memory totals), indexed by the x tile; of.w is still 0.
offset = sA[threadIdx.x].wy + tgx + (sA[threadIdx.x].wx >> GRIDBITS) * cSim.stride;
of.x = sA[threadIdx.x].fx;
of.y = sA[threadIdx.x].fy;
of.z = sA[threadIdx.x].fz;
cSim.pForce4b[offset] = of;
// cSim.pBornSum[offset] = sA[threadIdx.x].sum;
}
// Step to this run's next work unit.
// NOTE(review): presumably bornForce2_workBlock is the number of GRID-wide runs per block -- confirm.
sA[threadIdx.x].pos -= cSim.bornForce2_workBlock;
}
}
// Kernel variant selected below when gpu->sm_version is not less than SM_12;
// it is not defined in this translation unit.
__global__ extern void kCalculateObcGbsaForces2_12_kernel();

// Host-side launcher for the second OBC/GBSA force loop.
//
// gpu: context supplying sm_version (selects the kernel variant) and the
//      sim.bornForce2_blocks / sim.bornForce2_threads_per_block launch
//      configuration.
//
// Launches kCalculateObcGbsaForces2_kernel when gpu->sm_version < SM_12 and
// kCalculateObcGbsaForces2_12_kernel otherwise, then reports launch errors.
void kCalculateObcGbsaForces2(gpuContext gpu)
{
    if (gpu->sm_version < SM_12)
    {
        kCalculateObcGbsaForces2_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block>>>();
    }
    else
    {
        kCalculateObcGbsaForces2_12_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block>>>();
    }

    // Debug dump, disabled by the constant-false condition.
    if (0)
    {
        static int step = 0;
        step++;
        kReduceBornSumAndForces(gpu);
        gpuDumpObcLoop1(gpu);
    }

    LAUNCHERROR("kCalculateObcGbsaForces2");
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using namespace std;
#include "gputypes.h"
// Per-thread shared-memory record used by the force kernel in this file.
// Member order is unchanged, so the memory layout is the same as before.
struct Atom {
    float x, y, z;     // x/y/z components copied from a cSim.pPosq entry
    float r;           // x component of the atom's cSim.pObcData entry
    float sr;          // y component of the atom's cSim.pObcData entry
    float sr2;         // sr * sr
    float fx, fy, fz;  // force accumulators
    float fb;          // value copied from cSim.pBornForce
};
// Shared-memory scratch records; the kernel below indexes this by threadIdx.x.
__shared__ Atom sA[GT2XX_BORNFORCE2_THREADS_PER_BLOCK];
// This block's slice of cSim.pWorkUnit, copied in at the start of the kernel.
__shared__ unsigned int sWorkUnit[GT2XX_NONBOND_WORKUNITS_PER_SM];
// Ring successor table; the kernel fills it with sNext[t] = (t + 1) & (GRID - 1).
__shared__ unsigned int sNext[GRID];
// File-local constant-memory copy of the simulation parameters, filled from
// gpu->sim by SetCalculateObcGbsaForces2_12Sim.
static __constant__ cudaGmxSimulation cSim;
// Copies the host-side simulation parameters (gpu->sim) into this file's
// constant-memory symbol cSim. The copy status is handed to RTERROR together
// with a description of the failed operation.
void SetCalculateObcGbsaForces2_12Sim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Copies this file's constant-memory symbol cSim back into the host-side
// simulation parameters (gpu->sim). The copy status is handed to RTERROR
// together with a description of the failed operation.
void GetCalculateObcGbsaForces2_12Sim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    // The message used to say "SetSim" (copied from the setter); it now names
    // the operation that actually failed.
    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
}
// For every work unit (tile) assigned to this block, converts the per-atom
// values read from cSim.pBornForce into pairwise x/y/z force contributions
// between the GRID atoms starting at index x and the GRID atoms starting at
// index y, and writes them to cSim.pForce4b (w component set to 0).
//
// Structure visible in this kernel:
//   * Each run of GRID consecutive threads (threadIdx.x >> GRIDBITS) processes
//     one work unit at a time; tgx is the thread's index within that run.
//   * A work unit is a packed unsigned int: bits 17 and up give the x tile
//     index, bits 2..16 give the y tile index; the two low bits are not read here.
//   * Output index = atom index + (partner tile index) * cSim.stride.
//   * In the off-diagonal branch the "I" and "J" halves of each pair are
//     computed interleaved, with separate ...I / ...J temporaries.
//
// NOTE(review): sA entries written by one thread are read and updated by other
// threads of the same GRID-wide run without a barrier in between; this
// presumably relies on those threads executing in lockstep within a warp --
// confirm before reusing on other hardware.
__global__ void kCalculateObcGbsaForces2_12_kernel()
{
// Read queue of work blocks once so the remainder of
// kernel can run asynchronously
// [pos, end) is this block's slice of cSim.pWorkUnit; the first
// bf2WorkUnitsPerBlockRemainder blocks each receive one extra work unit.
int pos = cSim.bf2WorkUnitsPerBlock * blockIdx.x + min(blockIdx.x, cSim.bf2WorkUnitsPerBlockRemainder);
int end = cSim.bf2WorkUnitsPerBlock * (blockIdx.x + 1) + min((blockIdx.x + 1), cSim.bf2WorkUnitsPerBlockRemainder);
// NOTE(review): assumes the slice fits in both blockDim.x and sWorkUnit -- not checked here.
if (threadIdx.x < end - pos)
{
sWorkUnit[threadIdx.x] = cSim.pWorkUnit[pos + threadIdx.x];
}
// Build the ring successor table: sNext[t] = (t + 1) mod GRID (GRID is used as a power-of-two mask).
if (threadIdx.x < GRID)
{
sNext[threadIdx.x] = (threadIdx.x + 1) & (GRID - 1);
}
__syncthreads();
// Now change pos and end to reflect work queue just read
// into shared memory
end = end - pos;
// Each GRID-wide run of threads starts from a different work unit, counting
// back from the end of the queue; runs beyond the queue get pos < 0 and idle.
pos = end - (threadIdx.x >> GRIDBITS) - 1;
while (pos >= 0)
{
// Extract cell coordinates from appropriate work unit
unsigned int x = sWorkUnit[pos];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
x = (x >> 17) << GRIDBITS;
unsigned int tgx = threadIdx.x & (GRID - 1);
// Atom i (from the x tile) is held in registers: position, pObcData pair (a.x, a.y) and fb.
unsigned int i = x + tgx;
float4 apos = cSim.pPosq[i];
float2 a = cSim.pObcData[i];
float fb = cSim.pBornForce[i];
unsigned int tbx = threadIdx.x - tgx;
int tj = tgx;
// psA points at the first sA entry of this thread's GRID-wide run.
Atom* psA = &sA[tbx];
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
float3 af;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
sA[threadIdx.x].fz = af.z = 0.0f;
// float sum = 0.0f;
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
// float oneOverR = 1.0f / a.x;
sA[threadIdx.x].r = a.x;
sA[threadIdx.x].sr = a.y;
sA[threadIdx.x].sr2 = a.y * a.y;
sA[threadIdx.x].fb = fb;
// Walk the ring starting at the successor of tgx and stop on returning to
// tgx, so the thread's own atom (r == 0) is never visited.
for (unsigned int j = sNext[tgx]; j != tgx; j = sNext[j])
{
float dx = psA[j].x - apos.x;
float dy = psA[j].y - apos.y;
float dz = psA[j].z - apos.z;
float r2 = dx * dx + dy * dy + dz * dz;
float r = sqrt(r2);
// Atom I Born forces and sum
float rScaledRadiusJ = r + psA[j].sr;
float l_ij = 1.0f / max(a.x, fabs(r - psA[j].sr));
float u_ij = 1.0f / rScaledRadiusJ;
float rInverse = 1.0f / r;
float l_ij2 = l_ij * l_ij;
float u_ij2 = u_ij * u_ij;
float r2Inverse = rInverse * rInverse;
float t1 = log (u_ij / l_ij);
float t2 = (l_ij2 - u_ij2);
float t3 = t2 * rInverse;
t1 *= rInverse;
// Born Forces term
float term = 0.125f *
(1.000f + psA[j].sr2 * r2Inverse) * t3 +
0.250f * t1 * r2Inverse;
float dE = fb * term;
// Born sum term
// term = l_ij - u_ij +
// -0.25f * r * t2 +
// 0.50f * t1 +
// (0.25f * psA[j].sr2) * t3;
// if (a.x < (psA[j].sr - r))
// {
// term += 2.0f * (oneOverR - l_ij);
// }
// No contribution when a.x is at least r + sr_j.
if (a.x >= rScaledRadiusJ)
{
dE = /*term =*/ 0.0f;
}
// Equal and opposite contributions: registers for atom i, shared memory for atom j.
float d = dx * dE;
af.x -= d;
psA[j].fx += d;
d = dy * dE;
af.y -= d;
psA[j].fy += d;
d = dz * dE;
af.z -= d;
psA[j].fz += d;
// sum += term;
}
// Write results
// Combine the register total with what the other threads of the run added
// to this thread's sA entry.
int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
float4 of;
of.x = af.x + sA[threadIdx.x].fx;
of.y = af.y + sA[threadIdx.x].fy;
of.z = af.z + sA[threadIdx.x].fz;
of.w = 0.0f;
cSim.pForce4b[offset] = of;
// cSim.pBornSum[offset] = sum;
}
else
{
// Read fixed atom data into registers and GRF
// Atom j (from the y tile) goes to shared memory.
int j = y + tgx;
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pObcData[j];
sA[threadIdx.x].fb = cSim.pBornForce[j];
float3 af;
sA[threadIdx.x].fx = af.x = 0.0f;
sA[threadIdx.x].fy = af.y = 0.0f;
sA[threadIdx.x].fz = af.z = 0.0f;
// sA[threadIdx.x].sum = 0.0f;
// float sum = 0.0f;
float sr2 = a.y * a.y;
// float oneOverR = 1.0f / a.x;
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].r = temp1.x;
sA[threadIdx.x].sr = temp1.y;
sA[threadIdx.x].sr2 = temp1.y * temp1.y;
// tj starts at tgx and follows sNext, so within one iteration the threads of
// a run address distinct sA entries while each thread still visits all GRID of them.
for (j = 0; j < GRID; j++)
{
float dx = psA[tj].x - apos.x;
float dy = psA[tj].y - apos.y;
float dz = psA[tj].z - apos.z;
float r2 = dx * dx + dy * dy + dz * dz;
float r = sqrt(r2);
// Interleaved Atom I and J Born Forces and sum components
// ...J temporaries use atom i's a.x with atom tj's sr; ...I temporaries use
// atom tj's r with atom i's sr (a.y).
float r2Inverse = 1.0f / r2;
float rScaledRadiusJ = r + psA[tj].sr;
float rScaledRadiusI = r + a.y;
float rInverse = 1.0f / r;
float l_ijJ = 1.0f / max(a.x, fabs(r - psA[tj].sr));
float l_ijI = 1.0f / max(psA[tj].r, fabs(r - a.y));
float u_ijJ = 1.0f / rScaledRadiusJ;
float u_ijI = 1.0f / rScaledRadiusI;
float l_ij2J = l_ijJ * l_ijJ;
float l_ij2I = l_ijI * l_ijI;
float u_ij2J = u_ijJ * u_ijJ;
float u_ij2I = u_ijI * u_ijI;
float t1J = log (u_ijJ / l_ijJ);
float t1I = log (u_ijI / l_ijI);
float t2J = (l_ij2J - u_ij2J);
float t2I = (l_ij2I - u_ij2I);
float t3J = t2J * rInverse;
float t3I = t2I * rInverse;
t1J *= rInverse;
t1I *= rInverse;
// Born Forces term
float term = 0.125f *
(1.000f + psA[tj].sr2 * r2Inverse) * t3J +
0.250f * t1J * r2Inverse;
float dE = fb * term;
// Atom I Born sum term
// term = l_ijJ - u_ijJ +
// -0.25f * r * t2J +
// 0.50f * t1J +
// (0.25f * psA[tj].sr2) * t3J;
// if (a.x < (psA[tj].sr - r))
// {
// term += 2.0f * (oneOverR - l_ijJ);
// }
// No contribution when a.x is at least r + sr_j.
if (a.x >= rScaledRadiusJ)
{
dE = /*term =*/ 0.0f;
}
float d = dx * dE;
af.x -= d;
psA[tj].fx += d;
d = dy * dE;
af.y -= d;
psA[tj].fy += d;
d = dz * dE;
af.z -= d;
psA[tj].fz += d;
// sum += term;
// Atom J Born sum term
term = 0.125f *
(1.000f + sr2 * r2Inverse) * t3I +
0.250f * t1I * r2Inverse;
dE = psA[tj].fb * term;
// term = l_ijI - u_ijI +
// -0.25f * r * t2I +
// 0.50f * t1I +
// (0.25f * sr2) * t3I;
// if (psA[tj].r < (a.y - r))
// {
// term += 2.0f * ((1.0f / psA[tj].r) - l_ijI);
// }
// No contribution when atom tj's r is at least r + sr_i.
if (psA[tj].r >= rScaledRadiusI)
{
dE = /*term =*/ 0.0f;
}
dx *= dE;
dy *= dE;
dz *= dE;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
af.x -= dx;
af.y -= dy;
af.z -= dz;
// psA[tj].sum += term;
tj = sNext[tj];
}
// Write results
// First the x-tile atom (register totals), indexed by the y tile ...
int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
float4 of;
of.x = af.x;
of.y = af.y;
of.z = af.z;
of.w = 0.0f;
cSim.pForce4b[offset] = of;
// cSim.pBornSum[offset] = sum;
// ... then the y-tile atom (shared-memory totals), indexed by the x tile; of.w is still 0.
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
of.x = sA[threadIdx.x].fx;
of.y = sA[threadIdx.x].fy;
of.z = sA[threadIdx.x].fz;
cSim.pForce4b[offset] = of;
// cSim.pBornSum[offset] = sA[threadIdx.x].sum;
}
// Step to this run's next work unit.
// NOTE(review): presumably bornForce2_workBlock is the number of GRID-wide runs per block -- confirm.
pos -= cSim.bornForce2_workBlock;
}
}
// Host-side launcher for kCalculateObcGbsaForces2_12_kernel.
//
// gpu: context whose sim.bornForce2_blocks / sim.bornForce2_threads_per_block
//      fields give the launch configuration.
//
// LAUNCHERROR is invoked right after the launch, labelled with this function's name.
void kCalculateObcGbsaForces2_12(gpuContext gpu)
{
// printf("kCalculateObcGbsaForces2_12\n");
kCalculateObcGbsaForces2_12_kernel<<<gpu->sim.bornForce2_blocks, gpu->sim.bornForce2_threads_per_block>>>();
LAUNCHERROR("kCalculateObcGbsaForces2_12");
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using namespace std;
#include "gputypes.h"
#define FABS(a) ((a) > 0.0f ? (a) : -(a))
static __constant__ cudaGmxSimulation cSim;
// Copies the host-side simulation descriptor gpu->sim into this translation
// unit's __constant__ copy (cSim). RTERROR handles a failed copy.
void SetForcesSim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Copies this translation unit's __constant__ simulation descriptor (cSim)
// back into gpu->sim on the host. RTERROR handles a failed copy.
void GetForcesSim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
// Zeroes the first cSim.stride4 * cSim.outputBuffers floats of cSim.pForce4,
// treating the buffer as a flat float array. Each thread starts at its global
// index and advances by the total number of launched threads, so any launch
// configuration covers the whole range.
__global__ void kClearForces_kernel()
{
    float* const forceWords = (float*)cSim.pForce4;
    for (unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
         idx < cSim.stride4 * cSim.outputBuffers;
         idx += gridDim.x * blockDim.x)
    {
        forceWords[idx] = 0.0f;
    }
}
// Host-side launcher for kClearForces_kernel: gpu->sim.blocks blocks of
// 384 threads each. Launch errors are reported through LAUNCHERROR.
void kClearForces(gpuContext gpu)
{
    const dim3 gridShape(gpu->sim.blocks);
    const dim3 blockShape(384);
    kClearForces_kernel<<<gridShape, blockShape>>>();
    LAUNCHERROR("kClearForces");
}
// Zeroes the first cSim.stride * cSim.nonbondOutputBuffers floats of
// cSim.pBornForce. Each thread starts at its global index and advances by the
// total number of launched threads.
__global__ void kClearBornForces_kernel()
{
    float* const bornForceWords = (float*)cSim.pBornForce;
    for (unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
         idx < cSim.stride * cSim.nonbondOutputBuffers;
         idx += gridDim.x * blockDim.x)
    {
        bornForceWords[idx] = 0.0f;
    }
}
// Host-side launcher for kClearBornForces_kernel: gpu->sim.blocks blocks of
// 384 threads each. Launch errors are reported through LAUNCHERROR.
void kClearBornForces(gpuContext gpu)
{
    const dim3 gridShape(gpu->sim.blocks);
    const dim3 blockShape(384);
    kClearBornForces_kernel<<<gridShape, blockShape>>>();
    LAUNCHERROR("kClearBornForces");
}
// Two reductions run back to back, sharing one running index per thread.
//
// Phase 1 (forces): for every float slot below cSim.stride4, add up the values
// found at that slot in each of the cSim.outputBuffers copies of pForce4
// (copies are cSim.stride4 floats apart) and store the total in the first copy.
//
// Phase 2 (Born sums): the index carries on from where phase 1 stopped, so
// (pos - cSim.stride4) is used as the atom index. For each atom the
// cSim.nonbondOutputBuffers partial sums in pBornSum (cSim.stride apart) are
// totalled, the total is written back, and the Born radius and OBC chain term
// are derived from it and stored in pBornRadii / pObcChain.
__global__ void kReduceBornSumAndForces_kernel()
{
    unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);

    // ---- Phase 1: force reduction ----
    for (; pos < cSim.stride4; pos += gridDim.x * blockDim.x)
    {
        float* const pDest = (float*)cSim.pForce4 + pos;
        float* pSrc        = pDest;
        float forceAccum   = 0.0f;
        int buffersLeft    = cSim.outputBuffers;

        // Consume the buffers four at a time, then a pair, then a single one.
        for (; buffersLeft >= 4; buffersLeft -= 4)
        {
            float b0 = *pSrc;
            pSrc += cSim.stride4;
            float b1 = *pSrc;
            pSrc += cSim.stride4;
            float b2 = *pSrc;
            pSrc += cSim.stride4;
            float b3 = *pSrc;
            pSrc += cSim.stride4;
            forceAccum += b0 + b1 + b2 + b3;
        }
        if (buffersLeft >= 2)
        {
            float b0 = *pSrc;
            pSrc += cSim.stride4;
            float b1 = *pSrc;
            pSrc += cSim.stride4;
            forceAccum += b0 + b1;
            buffersLeft -= 2;
        }
        if (buffersLeft > 0)
        {
            forceAccum += *pSrc;
        }

        *pDest = forceAccum;
    }

    // ---- Phase 2: Born sum reduction and OBC terms ----
    for (; pos - cSim.stride4 < cSim.atoms; pos += gridDim.x * blockDim.x)
    {
        float* pSrc      = cSim.pBornSum + pos - cSim.stride4;
        float2 obcParams = cSim.pObcData[pos - cSim.stride4];
        float bornTotal  = 0.0f;
        int buffersLeft  = cSim.nonbondOutputBuffers;

        for (; buffersLeft >= 4; buffersLeft -= 4)
        {
            float b0 = *pSrc;
            pSrc += cSim.stride;
            float b1 = *pSrc;
            pSrc += cSim.stride;
            float b2 = *pSrc;
            pSrc += cSim.stride;
            float b3 = *pSrc;
            pSrc += cSim.stride;
            bornTotal += b0 + b1 + b2 + b3;
        }
        if (buffersLeft >= 2)
        {
            float b0 = *pSrc;
            pSrc += cSim.stride;
            float b1 = *pSrc;
            pSrc += cSim.stride;
            bornTotal += b0 + b1;
            buffersLeft -= 2;
        }
        if (buffersLeft > 0)
        {
            bornTotal += *pSrc;
        }

        // The raw total is stored first; the scaled value is only used locally.
        cSim.pBornSum[pos - cSim.stride4] = bornTotal;

        bornTotal *= 0.5f * obcParams.x;
        float total2       = bornTotal * bornTotal;
        float total3       = bornTotal * total2;
        float tanhTerm     = tanh(cSim.alphaOBC * bornTotal - cSim.betaOBC * total2 + cSim.gammaOBC * total3);
        float fullRadius   = obcParams.x + cSim.dielectricOffset;
        float radius       = 1.0f / (1.0f / obcParams.x - tanhTerm / fullRadius);
        float chainPoly    = obcParams.x * (cSim.alphaOBC - 2.0f * cSim.betaOBC * bornTotal + 3.0f * cSim.gammaOBC * total2);
        float chain        = (1.0f - tanhTerm * tanhTerm) * chainPoly / fullRadius;

        cSim.pBornRadii[pos - cSim.stride4] = radius;
        cSim.pObcChain[pos - cSim.stride4]  = chain;
    }
}
// Host-side launcher for kReduceBornSumAndForces_kernel, using gpu->sim.blocks
// blocks of gpu->sim.bsf_reduce_threads_per_block threads. Launch errors are
// reported through LAUNCHERROR. The trailing "#if 0" section is a disabled
// debugging dump of the reduced forces.
void kReduceBornSumAndForces(gpuContext gpu)
{
    const dim3 gridShape(gpu->sim.blocks);
    const dim3 blockShape(gpu->sim.bsf_reduce_threads_per_block);
    kReduceBornSumAndForces_kernel<<<gridShape, blockShape>>>();
    LAUNCHERROR("kReduceBornSumAndForces");
#if 0
    //gpuDumpObcLoop1( gpu );
    /*
    gpu->psForce4->Download();
    for (int i = 0; i < gpu->natoms; i++)
    {
        printf("%4d: %12.6f %12.6f %12.6f\n", i,
            gpu->psForce4->_pSysStream[0][i].x,
            gpu->psForce4->_pSysStream[0][i].y,
            gpu->psForce4->_pSysStream[0][i].z
        );
    } */
#endif
}
// For every float slot below cSim.stride4, adds up the values found at that
// slot in each of the cSim.outputBuffers copies of pForce4 (copies are
// cSim.stride4 floats apart) and stores the total in the first copy.
__global__ void kReduceForces_kernel()
{
    for (unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
         pos < cSim.stride4;
         pos += gridDim.x * blockDim.x)
    {
        float* const pDest = (float*)cSim.pForce4 + pos;
        float* pSrc        = pDest;
        float forceAccum   = 0.0f;
        int buffersLeft    = cSim.outputBuffers;

        // Consume the buffers four at a time, then a pair, then a single one.
        for (; buffersLeft >= 4; buffersLeft -= 4)
        {
            float b0 = *pSrc;
            pSrc += cSim.stride4;
            float b1 = *pSrc;
            pSrc += cSim.stride4;
            float b2 = *pSrc;
            pSrc += cSim.stride4;
            float b3 = *pSrc;
            pSrc += cSim.stride4;
            forceAccum += b0 + b1 + b2 + b3;
        }
        if (buffersLeft >= 2)
        {
            float b0 = *pSrc;
            pSrc += cSim.stride4;
            float b1 = *pSrc;
            pSrc += cSim.stride4;
            forceAccum += b0 + b1;
            buffersLeft -= 2;
        }
        if (buffersLeft > 0)
        {
            forceAccum += *pSrc;
        }

        *pDest = forceAccum;
    }
}
// Host-side launcher for kReduceForces_kernel, using gpu->sim.blocks blocks of
// gpu->sim.bsf_reduce_threads_per_block threads. Launch errors are reported
// through LAUNCHERROR.
void kReduceForces(gpuContext gpu)
{
    const dim3 gridShape(gpu->sim.blocks);
    const dim3 blockShape(gpu->sim.bsf_reduce_threads_per_block);
    kReduceForces_kernel<<<gridShape, blockShape>>>();
    LAUNCHERROR("kReduceForces");
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using namespace std;
#include "gputypes.h"
static __constant__ cudaGmxSimulation cSim;
// Copies the host-side simulation descriptor gpu->sim into this translation
// unit's __constant__ copy (cSim). RTERROR handles a failed copy.
void SetRandomSim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Copies this translation unit's __constant__ simulation descriptor (cSim)
// back into gpu->sim on the host. RTERROR handles a failed copy.
void GetRandomSim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
extern __shared__ float3 sRand[];
// Fills cSim.pRandom4a and cSim.pRandom2a with scaled random values.
//
// Each thread loads its own uint4 generator state from cSim.pRandomSeed
// (indexed by global thread id), advances it while producing values, and
// writes it back to the same slot before returning. The local "carry" word
// starts at zero on every launch and is not saved.
//
// Every uniform integer is the sum state.x + state.y + state.w of three
// recurrences: a multiply-add on state.x, a shift/xor sequence on state.y and
// a carry-based update of state.z/state.w. The statements of consecutive
// draws are interleaved, so their order must not be changed.
//
// Per outer-loop iteration a thread produces six values of the form
// sqrt(-2 * log(u)) * cos(2 * pi * v) (Box-Muller transform): three go to
// sRand[threadIdx.x] and three to sRand[threadIdx.x + blockDim.x]. The kernel
// therefore needs 2 * blockDim.x float3 entries of dynamic shared memory.
// A thread only ever touches its own two sRand entries, so no barrier is used.
__global__ void kGenerateRandoms_kernel()
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int increment = blockDim.x * gridDim.x;
// Read generator state
uint4 state = cSim.pRandomSeed[pos];
unsigned int carry = 0;
float4 random4;
float2 random2;
while (pos < cSim.totalRandomsTimesTwo)
{
// Generate 6 randoms in GRF
// Pass 0 fills sRand[threadIdx.x], pass 1 fills sRand[threadIdx.x + blockDim.x].
unsigned int pos1 = threadIdx.x;
for (int i = 0; i < 2; i++)
{
state.x = state.x * 69069 + 1;
state.y ^= state.y << 13;
state.y ^= state.y >> 17;
state.y ^= state.y << 5;
unsigned int k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
unsigned int m = state.w + state.w + state.z + carry;
state.z = state.w;
state.w = m;
carry = k >> 30;
// Clamped to at least 1 so the log() below never receives zero.
float x1 = (float)max(state.x + state.y + state.w, 0x00000001) / (float)0xffffffff;
state.x = state.x * 69069 + 1;
state.y ^= state.y << 13;
state.y ^= state.y >> 17;
state.y ^= state.y << 5;
x1 = sqrt(-2.0f * log(x1));
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry;
state.z = state.w;
state.w = m;
carry = k >> 30;
// Angle draw: no clamp needed because it only feeds cos().
float x2 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
state.x = state.x * 69069 + 1;
state.y ^= state.y << 13;
state.y ^= state.y >> 17;
state.y ^= state.y << 5;
sRand[pos1].x = x1 * cos(2.0f * 3.14159265f * x2);
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry;
state.z = state.w;
state.w = m;
carry = k >> 30;
float x3 = (float)max(state.x + state.y + state.w, 0x00000001) / (float)0xffffffff;
state.x = state.x * 69069 + 1;
state.y ^= state.y << 13;
state.y ^= state.y >> 17;
state.y ^= state.y << 5;
x3 = sqrt(-2.0f * log(x3));
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry;
state.z = state.w;
state.w = m;
carry = k >> 30;
float x4 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
state.x = state.x * 69069 + 1;
state.y ^= state.y << 13;
state.y ^= state.y >> 17;
state.y ^= state.y << 5;
sRand[pos1].y = x3 * cos(2.0f * 3.14159265f * x4);
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry;
state.z = state.w;
state.w = m;
carry = k >> 30;
float x5 = (float)max(state.x + state.y + state.w, 0x00000001) / (float)0xffffffff;
state.x = state.x * 69069 + 1;
state.y ^= state.y << 13;
state.y ^= state.y >> 17;
state.y ^= state.y << 5;
x5 = sqrt(-2.0f * log(x5));
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry;
state.z = state.w;
state.w = m;
carry = k >> 30;
float x6 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
sRand[pos1].z = x5 * cos(2.0f * 3.14159265f * x6);
pos1 += blockDim.x;
}
// Output final randoms
// Entries below cSim.totalRandoms are scaled by (Yv, V); the rest by (Yx, X).
float c1, c2;
if (pos < cSim.totalRandoms)
{
c1 = cSim.Yv;
c2 = cSim.V;
}
else
{
c1 = cSim.Yx;
c2 = cSim.X;
}
// The first three values are scaled by c1, the last three by c2.
random4.x = c1 * sRand[threadIdx.x].x;
random4.y = c1 * sRand[threadIdx.x].y;
random4.z = c1 * sRand[threadIdx.x].z;
random4.w = c2 * sRand[threadIdx.x + blockDim.x].x;
cSim.pRandom4a[pos] = random4;
random2.x = c2 * sRand[threadIdx.x + blockDim.x].y;
random2.y = c2 * sRand[threadIdx.x + blockDim.x].z;
cSim.pRandom2a[pos] = random2;
pos += increment;
}
// Write generator state
pos = blockIdx.x * blockDim.x + threadIdx.x;
cSim.pRandomSeed[pos] = state;
}
// Host-side launcher for kGenerateRandoms_kernel.
//
// Launches gpu->sim.blocks blocks of gpu->sim.random_threads_per_block threads
// and requests two float3 entries of dynamic shared memory per thread, which
// is what the kernel indexes (sRand[threadIdx.x] and
// sRand[threadIdx.x + blockDim.x]).
//
// The launch is followed by LAUNCHERROR, like every other launcher in this
// code base, so that a bad launch configuration is reported here rather than
// surfacing later at an unrelated CUDA call.
void kGenerateRandoms(gpuContext gpu)
{
    kGenerateRandoms_kernel<<<gpu->sim.blocks, gpu->sim.random_threads_per_block, gpu->sim.random_threads_per_block * 2 * sizeof(float3)>>>();
    LAUNCHERROR("kGenerateRandoms");
}
\ No newline at end of file
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
//#include <fstream>
using namespace std;
#define DeltaShake
#include "gputypes.h"
// Per-thread scratch record for the SHAKE kernels in this file. Each kernel
// declares a __shared__ array of these and a thread works on sA[threadIdx.x].
struct Atom
{
// Separation vectors (central atom minus partner 1, 2, 3) computed from the
// old positions in pOldPosq.
float3 rij1;
float3 rij2;
float3 rij3;
// Copied from the .y component of the constraint's pShakeParameter entry;
// multiplies each correction factor.
float M;
// Copied from the .z component of pShakeParameter; the rijNsq values are
// subtracted from it and it scales the convergence tolerance.
float d2;
// Copied from the .x component of pShakeParameter; scales the correction
// applied to the central atom.
float InvMassI;
// Squared lengths of rij1, rij2 and rij3.
float rij1sq;
float rij2sq;
float rij3sq;
};
static __constant__ cudaGmxSimulation cSim;
// Copies the host-side simulation descriptor gpu->sim into this translation
// unit's __constant__ copy (cSim). RTERROR handles a failed copy.
void SetUpdateShakeHSim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
// Copies this translation unit's __constant__ simulation descriptor (cSim)
// back into gpu->sim on the host. RTERROR handles a failed copy.
void GetUpdateShakeHSim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
// First half of the integration step, processing one atom per loop iteration.
//
// For each atom the kernel:
//   * builds a random velocity term (stored to pvVector4) from the random
//     buffers pRandom4a / pRandom2a, offset by this block's pRandomPosition,
//   * updates the velocity from the old velocity, the force and those terms
//     (velocity.w is used as the inverse mass),
//   * saves the current position to pOldPosq,
//   * writes to pPosqP either the displacement velocity * fix1 (when
//     DeltaShake is defined) or the displaced position (otherwise),
//   * stores the new velocity to pVelm4.
__global__ void kUpdatePart1_kernel()
{
    const unsigned int randomBase = cSim.pRandomPosition[blockIdx.x];
    __syncthreads();

    for (unsigned int atom = threadIdx.x + blockIdx.x * blockDim.x;
         atom < cSim.atoms;
         atom += blockDim.x * gridDim.x)
    {
        // Gather everything this atom needs up front.
        float4 vel        = cSim.pVelm4[atom];
        const float4 xVec = cSim.pxVector4[atom];
        const float4 rnd4 = cSim.pRandom4a[randomBase + atom];
        const float2 rnd2 = cSim.pRandom2a[randomBase + atom];
        float4 posq       = cSim.pPosq[atom];
        const float4 frc  = cSim.pForce4[atom];

        const float rootInvMass = sqrt(vel.w);

        float3 vmh;
        vmh.x = xVec.x * cSim.DOverTauC + rootInvMass * rnd4.x;
        vmh.y = xVec.y * cSim.DOverTauC + rootInvMass * rnd4.y;
        vmh.z = xVec.z * cSim.DOverTauC + rootInvMass * rnd4.z;

        float4 vNoise;
        vNoise.x = rootInvMass * rnd4.w;
        vNoise.y = rootInvMass * rnd2.x;
        vNoise.z = rootInvMass * rnd2.y;
        vNoise.w = 0.0f;
        cSim.pvVector4[atom] = vNoise;

        vel.x = vel.x * cSim.EM + vel.w * frc.x * cSim.TauOneMinusEM + vNoise.x - cSim.EM * vmh.x;
        vel.y = vel.y * cSim.EM + vel.w * frc.y * cSim.TauOneMinusEM + vNoise.y - cSim.EM * vmh.y;
        vel.z = vel.z * cSim.EM + vel.w * frc.z * cSim.TauOneMinusEM + vNoise.z - cSim.EM * vmh.z;

        // Keep the pre-step position before posq is reused for the output.
        cSim.pOldPosq[atom] = posq;
#ifndef DeltaShake
        posq.x += vel.x * cSim.fix1;
        posq.y += vel.y * cSim.fix1;
        posq.z += vel.z * cSim.fix1;
#else
        posq.x = vel.x * cSim.fix1;
        posq.y = vel.y * cSim.fix1;
        posq.z = vel.z * cSim.fix1;
#endif
        cSim.pPosqP[atom] = posq;
        cSim.pVelm4[atom] = vel;
    }
}
// Variant of kUpdatePart1_kernel that also subtracts a summed linear-momentum
// vector from every updated velocity.
//
// Stage 1: every block independently sums all gridDim.x entries of
// cSim.pLinearMomentum (kUpdatePart2CM_kernel writes one entry per block).
// Each thread accumulates the entries threadIdx.x, threadIdx.x + blockDim.x,
// ... into its slot of the dynamic shared array sCM, and a pairwise reduction
// then leaves the block-wide total in sCM[0].
//
// Stage 2: the same per-atom update as kUpdatePart1_kernel, with sCM[0]
// subtracted from each velocity component.
//
// Requires blockDim.x float3 entries of dynamic shared memory.
__global__ void kUpdatePart1CM_kernel()
{
extern __shared__ float3 sCM[];
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int rpos = cSim.pRandomPosition[blockIdx.x];
float3 CM = { 0.0f, 0.0f, 0.0f};
float4 CM1 = { 0.0f, 0.0f, 0.0f, 0.0f };
// Read CM outputs from previous step
unsigned int cpos = threadIdx.x;
// The "#if 0" branch is a disabled, unrolled alternative; the "#else" branch
// below is the code that is compiled.
#if 0
float4 CM2 = { 0.0f, 0.0f, 0.0f, 0.0f };
float4 CM3 = { 0.0f, 0.0f, 0.0f, 0.0f };
float4 CM4 = { 0.0f, 0.0f, 0.0f, 0.0f };
if (cpos < gridDim.x)
CM1 = cSim.pLinearMomentum[cpos];
cpos += gridDim.x;
if (cpos < gridDim.x)
CM2 = cSim.pLinearMomentum[cpos];
cpos += gridDim.x;
if (cpos < gridDim.x)
CM3 = cSim.pLinearMomentum[cpos];
cpos += gridDim.x;
if (cpos < gridDim.x)
CM4 = cSim.pLinearMomentum[cpos];
sCM[threadIdx.x].x = CM1.x + CM2.x + CM3.x + CM4.x;
sCM[threadIdx.x].y = CM1.y + CM2.y + CM3.y + CM4.y;
sCM[threadIdx.x].z = CM1.z + CM2.z + CM3.z + CM4.z;
#else
// Each thread sums every blockDim.x-th per-block entry, starting at its own id.
while (cpos < gridDim.x)
{
CM1 = cSim.pLinearMomentum[cpos];
CM.x += CM1.x;
CM.y += CM1.y;
CM.z += CM1.z;
cpos += blockDim.x;
}
sCM[threadIdx.x].x = CM.x;
sCM[threadIdx.x].y = CM.y;
sCM[threadIdx.x].z = CM.z;
#endif
// All partial sums must be in shared memory before the reduction reads them.
__syncthreads();
// Reduce CM
// Pairwise reduction: at each level, threads whose id has the low bits in
// "mask" clear add the slot "offset" entries above them. The barrier sits
// outside the conditional so that every thread reaches it.
unsigned int offset = 1;
unsigned int mask = 1;
while (offset < blockDim.x)
{
if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
{
sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
}
mask = 2 * mask + 1;
offset *= 2;
__syncthreads();
}
// Per-atom update; sCM[0] now holds the block-wide total for every thread to read.
while (pos < cSim.atoms)
{
float4 velocity = cSim.pVelm4[pos];
float4 xVector = cSim.pxVector4[pos];
float4 random4a = cSim.pRandom4a[rpos + pos];
float2 random2a = cSim.pRandom2a[rpos + pos];
float4 apos = cSim.pPosq[pos];
float4 force = cSim.pForce4[pos];
float3 Vmh;
// velocity.w is used as the inverse mass throughout this update.
float sqrtInvMass = sqrt(velocity.w);
Vmh.x = xVector.x * cSim.DOverTauC + sqrtInvMass * random4a.x;
Vmh.y = xVector.y * cSim.DOverTauC + sqrtInvMass * random4a.y;
Vmh.z = xVector.z * cSim.DOverTauC + sqrtInvMass * random4a.z;
float4 vVector;
vVector.x = sqrtInvMass * random4a.w;
vVector.y = sqrtInvMass * random2a.x;
vVector.z = sqrtInvMass * random2a.y;
vVector.w = 0.0f;
cSim.pvVector4[pos] = vVector;
velocity.x = velocity.x * cSim.EM +
velocity.w * force.x * cSim.TauOneMinusEM +
vVector.x -
cSim.EM * Vmh.x -
sCM[0].x;
velocity.y = velocity.y * cSim.EM +
velocity.w * force.y * cSim.TauOneMinusEM +
vVector.y -
cSim.EM * Vmh.y -
sCM[0].y;
velocity.z = velocity.z * cSim.EM +
velocity.w * force.z * cSim.TauOneMinusEM +
vVector.z -
cSim.EM * Vmh.z -
sCM[0].z;
// Save the pre-step position before apos is reused for the output.
cSim.pOldPosq[pos] = apos;
// With DeltaShake defined, pPosqP receives the displacement velocity * fix1
// rather than the displaced position.
#ifndef DeltaShake
apos.x += velocity.x * cSim.fix1;
apos.y += velocity.y * cSim.fix1;
apos.z += velocity.z * cSim.fix1;
#else
apos.x = velocity.x * cSim.fix1;
apos.y = velocity.y * cSim.fix1;
apos.z = velocity.z * cSim.fix1;
#endif
cSim.pPosqP[pos] = apos;
cSim.pVelm4[pos] = velocity;
pos += blockDim.x * gridDim.x;
}
}
// Host-side driver for the first half of the integration step.
//
// When gpu->bRemoveCM is set, the centre-of-mass variant of the kernel is
// launched (with one float3 of dynamic shared memory per thread) and the flag
// is cleared; otherwise the plain kernel is launched. Both launches use
// gpu->sim.blocks blocks of gpu->sim.update_threads_per_block threads and are
// followed by LAUNCHERROR. The "#if 0" sections are disabled debugging aids.
void kUpdatePart1(gpuContext gpu)
{
#if 0
    static int iteration = 0;
    if (iteration == 0)
    {
        gpu->psPosq4->Download();
        gpu->psVelm4->Download();
        printf("# %d atoms\n", gpu->natoms);
        for (int i = 0; i < gpu->natoms; i++)
        {
            printf("%5d %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f\n", i,
                gpu->psPosq4->_pSysStream[0][i].x, gpu->psPosq4->_pSysStream[0][i].y,
                gpu->psPosq4->_pSysStream[0][i].z, gpu->psPosq4->_pSysStream[0][i].w,
                gpu->psVelm4->_pSysStream[0][i].x, gpu->psVelm4->_pSysStream[0][i].y,
                gpu->psVelm4->_pSysStream[0][i].z, gpu->psVelm4->_pSysStream[0][i].w
            );
        }
    }
    iteration++;
#endif
#if 0
    static const float KILO = 1e3; // Thousand
    static const float BOLTZMANN = 1.380658e-23f; // (J/K)
    static const float AVOGADRO = 6.0221367e23f; // ()
    static const float RGAS = BOLTZMANN * AVOGADRO; // (J/(mol K))
    static const float BOLTZ = (RGAS / KILO); // (kJ/(mol K))
    static int iteration = 0;
    // Check T
    if (iteration % 1000 == 0)
    {
        gpu->psVelm4->Download();
        float ke = 0.0f;
        for (int i = 0; i < gpu->natoms; i++)
        {
            float vx = gpu->psVelm4->_pSysStream[0][i].x;
            float vy = gpu->psVelm4->_pSysStream[0][i].y;
            float vz = gpu->psVelm4->_pSysStream[0][i].z;
            float m = 1.0f / gpu->psVelm4->_pSysStream[0][i].w;
            ke += m * (vx * vx + vy * vy + vz * vz);
        }
        float T = ke / (BOLTZ * gpu->sim.degreesOfFreedom);
        printf("Iteration %d, Temperature is %f\n", iteration, T);
    }
    iteration++;
#endif
    // Common case first: no centre-of-mass correction pending.
    if (!gpu->bRemoveCM)
    {
        kUpdatePart1_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
        LAUNCHERROR("kUpdatePart1");
        return;
    }

    // Centre-of-mass variant: needs one float3 of shared memory per thread.
    kUpdatePart1CM_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block, gpu->sim.update_threads_per_block * sizeof(float3)>>>();
    LAUNCHERROR("kUpdatePart1CM");
    gpu->bRemoveCM = false;
#if 0
    gpu->psLinearMomentum->Download();
    gpu->psVelm4->Download();
    float3 mv = {0.0f, 0.0f, 0.0f};
    for (int i = 0; i < gpu->natoms; i++)
    {
        float mass = 1.0f / gpu->psVelm4->_pSysStream[0][i].w;
        mv.x += mass * gpu->psVelm4->_pSysStream[0][i].x;
        mv.y += mass * gpu->psVelm4->_pSysStream[0][i].y;
        mv.z += mass * gpu->psVelm4->_pSysStream[0][i].z;
    }
    mv.x *= gpu->sim.inverseTotalMass;
    mv.y *= gpu->sim.inverseTotalMass;
    mv.z *= gpu->sim.inverseTotalMass;
    float3 mv1 = {0.0f, 0.0f, 0.0f};
    for (int i = 0; i < gpu->sim.blocks; i++)
    {
        mv1.x += gpu->psLinearMomentum->_pSysStream[0][i].x;
        mv1.y += gpu->psLinearMomentum->_pSysStream[0][i].y;
        mv1.z += gpu->psLinearMomentum->_pSysStream[0][i].z;
    }
    printf("%11.5f %11.5f %11.5f | %11.5f %11.5f %11.5f\n", mv.x, mv.y, mv.z, mv1.x, mv1.y, mv1.z);
#endif
}
// Applies the SHAKE constraint iteration to the trial values in cSim.pPosqP.
//
// Each loop iteration handles one entry of cSim.pShakeID: a central atom
// (atomID.x) and up to three partners (atomID.y, .z, .w; -1 in .z or .w marks
// an unused partner). The matching cSim.pShakeParameter entry supplies
// InvMassI (.x), M (.y), d2 (.z) and invMassJ (.w); the same invMassJ is used
// for all partners of the entry.
//
// Reference separations come from the old positions in cSim.pOldPosq. The
// values read from cSim.pPosqP are corrected iteratively (at most 15 sweeps,
// stopping early once no constraint needed a correction) and written back to
// cSim.pPosqP. With DeltaShake defined, the pPosqP values are used as read
// (kUpdatePart1 stores displacements there); otherwise the old position is
// subtracted before and re-added after the iteration.
//
// Per-thread scratch lives in sA[threadIdx.x], so blockDim.x must not exceed
// G8X_THREADS_PER_BLOCK.
__global__ void kApplyFirstShake_kernel()
{
__shared__ Atom sA[G8X_THREADS_PER_BLOCK];
Atom* psA = &sA[threadIdx.x];
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
while (pos < cSim.ShakeConstraints)
{
int4 atomID = cSim.pShakeID[pos];
float4 params = cSim.pShakeParameter[pos];
float4 apos = cSim.pOldPosq[atomID.x];
float4 xpi = cSim.pPosqP[atomID.x];
float4 apos1 = cSim.pOldPosq[atomID.y];
float4 xpj1 = cSim.pPosqP[atomID.y];
float4 apos2 = {0.0f, 0.0f, 0.0f, 0.0f};
float4 xpj2 = {0.0f, 0.0f, 0.0f, 0.0f};
psA->InvMassI = params.x;
psA->M = params.y;
psA->d2 = params.z;
float invMassJ = params.w;
// Partners 2 and 3 are optional; their data stays zero when absent.
if (atomID.z != -1)
{
apos2 = cSim.pOldPosq[atomID.z];
xpj2 = cSim.pPosqP[atomID.z];
}
float4 apos3 = {0.0f, 0.0f, 0.0f, 0.0f};
float4 xpj3 = {0.0f, 0.0f, 0.0f, 0.0f};
if (atomID.w != -1)
{
apos3 = cSim.pOldPosq[atomID.w];
xpj3 = cSim.pPosqP[atomID.w];
}
float3 xi, xj1, xj2, xj3;
xi.x = apos.x;
xi.y = apos.y;
xi.z = apos.z;
xj1.x = apos1.x;
xj1.y = apos1.y;
xj1.z = apos1.z;
xj2.x = apos2.x;
xj2.y = apos2.y;
xj2.z = apos2.z;
xj3.x = apos3.x;
xj3.y = apos3.y;
xj3.z = apos3.z;
// Without DeltaShake the trial values are absolute positions; convert them
// to displacements relative to the old positions.
#ifndef DeltaShake
xpi.x -= xi.x;
xpi.y -= xi.y;
xpi.z -= xi.z;
xpj1.x -= xj1.x;
xpj1.y -= xj1.y;
xpj1.z -= xj1.z;
xpj2.x -= xj2.x;
xpj2.y -= xj2.y;
xpj2.z -= xj2.z;
xpj3.x -= xj3.x;
xpj3.y -= xj3.y;
xpj3.z -= xj3.z;
#endif
// Old-position separation vectors and their squared lengths.
psA->rij1.x = xi.x - xj1.x;
psA->rij1.y = xi.y - xj1.y;
psA->rij1.z = xi.z - xj1.z;
psA->rij2.x = xi.x - xj2.x;
psA->rij2.y = xi.y - xj2.y;
psA->rij2.z = xi.z - xj2.z;
psA->rij3.x = xi.x - xj3.x;
psA->rij3.y = xi.y - xj3.y;
psA->rij3.z = xi.z - xj3.z;
psA->rij1sq = psA->rij1.x * psA->rij1.x + psA->rij1.y * psA->rij1.y + psA->rij1.z * psA->rij1.z;
psA->rij2sq = psA->rij2.x * psA->rij2.x + psA->rij2.y * psA->rij2.y + psA->rij2.z * psA->rij2.z;
psA->rij3sq = psA->rij3.x * psA->rij3.x + psA->rij3.y * psA->rij3.y + psA->rij3.z * psA->rij3.z;
// Difference between d2 and each old squared separation.
float ld1 = psA->d2 - psA->rij1sq;
float ld2 = psA->d2 - psA->rij2sq;
float ld3 = psA->d2 - psA->rij3sq;
bool converged = false;
int iteration = 0;
// Sweep the constraints in order; the central atom's displacement is shared,
// so each correction feeds into the constraints evaluated after it.
while (iteration < 15 && !converged)
{
converged = true;
float3 rpij;
rpij.x = xpi.x - xpj1.x;
rpij.y = xpi.y - xpj1.y;
rpij.z = xpi.z - xpj1.z;
float rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
float rrpr = psA->rij1.x * rpij.x + psA->rij1.y * rpij.y + psA->rij1.z * rpij.z;
// Constraint error relative to d2 * shakeTolerance; >= 1 means out of tolerance.
float diff = fabs(ld1 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance);
if (diff >= 1.0f)
{
float acor = (ld1 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij1sq);
float3 dr;
dr.x = psA->rij1.x * acor;
dr.y = psA->rij1.y * acor;
dr.z = psA->rij1.z * acor;
// Move the two atoms in opposite directions along the old separation vector.
xpi.x += dr.x * psA->InvMassI;
xpi.y += dr.y * psA->InvMassI;
xpi.z += dr.z * psA->InvMassI;
xpj1.x -= dr.x * invMassJ;
xpj1.y -= dr.y * invMassJ;
xpj1.z -= dr.z * invMassJ;
converged = false;
}
if (atomID.z != -1)
{
rpij.x = xpi.x - xpj2.x;
rpij.y = xpi.y - xpj2.y;
rpij.z = xpi.z - xpj2.z;
rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
rrpr = psA->rij2.x * rpij.x + psA->rij2.y * rpij.y + psA->rij2.z * rpij.z;
diff = fabs(ld2 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance);
if (diff >= 1.0f)
{
float acor = (ld2 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij2sq);
float3 dr;
dr.x = psA->rij2.x * acor;
dr.y = psA->rij2.y * acor;
dr.z = psA->rij2.z * acor;
xpi.x += dr.x * psA->InvMassI;
xpi.y += dr.y * psA->InvMassI;
xpi.z += dr.z * psA->InvMassI;
xpj2.x -= dr.x * invMassJ;
xpj2.y -= dr.y * invMassJ;
xpj2.z -= dr.z * invMassJ;
converged = false;
}
}
if (atomID.w != -1)
{
rpij.x = xpi.x - xpj3.x;
rpij.y = xpi.y - xpj3.y;
rpij.z = xpi.z - xpj3.z;
rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
rrpr = psA->rij3.x * rpij.x + psA->rij3.y * rpij.y + psA->rij3.z * rpij.z;
diff = fabs(ld3 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance);
if (diff >= 1.0f)
{
float acor = (ld3 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij3sq);
float3 dr;
dr.x = psA->rij3.x * acor;
dr.y = psA->rij3.y * acor;
dr.z = psA->rij3.z * acor;
xpi.x += dr.x * psA->InvMassI;
xpi.y += dr.y * psA->InvMassI;
xpi.z += dr.z * psA->InvMassI;
xpj3.x -= dr.x * invMassJ;
xpj3.y -= dr.y * invMassJ;
xpj3.z -= dr.z * invMassJ;
converged = false;
}
}
iteration++;
}
// Without DeltaShake, convert the displacements back to absolute positions.
#ifndef DeltaShake
xpi.x += xi.x;
xpi.y += xi.y;
xpi.z += xi.z;
xpj1.x += xj1.x;
xpj1.y += xj1.y;
xpj1.z += xj1.z;
xpj2.x += xj2.x;
xpj2.y += xj2.y;
xpj2.z += xj2.z;
xpj3.x += xj3.x;
xpj3.y += xj3.y;
xpj3.z += xj3.z;
#endif
// Write back the corrected values; optional partners only when present.
cSim.pPosqP[atomID.x] = xpi;
cSim.pPosqP[atomID.y] = xpj1;
if (atomID.z != -1)
cSim.pPosqP[atomID.z] = xpj2;
if (atomID.w != -1)
cSim.pPosqP[atomID.w] = xpj3;
pos += blockDim.x * gridDim.x;
}
}
// Host-side launcher for kApplyFirstShake_kernel. Nothing is launched unless
// gpu->sim.ShakeConstraints is greater than zero; otherwise the kernel runs
// with gpu->sim.blocks blocks of gpu->sim.shake_threads_per_block threads and
// launch errors are reported through LAUNCHERROR.
void kApplyFirstShake(gpuContext gpu)
{
    if (!(gpu->sim.ShakeConstraints > 0))
    {
        return;
    }

    kApplyFirstShake_kernel<<<gpu->sim.blocks, gpu->sim.shake_threads_per_block>>>();
    LAUNCHERROR("kApplyFirstShake");
}
// Second half of the integration step, processing one atom per loop iteration.
//
// For each atom the kernel:
//   * recomputes the velocity from the value in pPosqP (used directly as a
//     displacement when DeltaShake is defined, otherwise relative to pPosq)
//     multiplied by cSim.oneOverFix1,
//   * builds a random position term (xVector) from pRandom4b / pRandom2b,
//     offset by this block's pRandomPosition (velocity.w is the inverse mass),
//   * adds (xVector - Xmh) to the pPosqP value and stores the result in pPosq,
//   * stores the velocity to pVelm4 and xVector to pxVector4.
//
// Afterwards thread 0 of each block advances the block's pRandomPosition entry
// by cSim.paddedNumberOfAtoms, wrapping by cSim.randoms.
//
// xVector.w is explicitly zeroed before the float4 is stored: it was
// previously left unassigned, so an indeterminate value ended up in
// pxVector4. This mirrors the vVector.w = 0.0f in the Part1 kernels.
__global__ void kUpdatePart2_kernel()
{
    unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int rpos = cSim.pRandomPosition[blockIdx.x];
    __syncthreads();
    while (pos < cSim.atoms)
    {
        float4 velocity = cSim.pVelm4[pos];
#ifndef DeltaShake
        float4 apos = cSim.pPosq[pos];
#endif
        float4 xPrime = cSim.pPosqP[pos];
        float4 vVector = cSim.pvVector4[pos];
        float4 xVector;
        float4 random4b = cSim.pRandom4b[rpos + pos];
        float2 random2b = cSim.pRandom2b[rpos + pos];
        float3 Xmh;
        float sqrtInvMass = sqrt(velocity.w);
#ifdef DeltaShake
        velocity.x = xPrime.x * cSim.oneOverFix1;
        velocity.y = xPrime.y * cSim.oneOverFix1;
        velocity.z = xPrime.z * cSim.oneOverFix1;
#else
        velocity.x = (xPrime.x - apos.x) * cSim.oneOverFix1;
        velocity.y = (xPrime.y - apos.y) * cSim.oneOverFix1;
        velocity.z = (xPrime.z - apos.z) * cSim.oneOverFix1;
#endif
        Xmh.x = vVector.x * cSim.TauDOverEMMinusOne +
                sqrtInvMass * random4b.x;
        Xmh.y = vVector.y * cSim.TauDOverEMMinusOne +
                sqrtInvMass * random4b.y;
        Xmh.z = vVector.z * cSim.TauDOverEMMinusOne +
                sqrtInvMass * random4b.z;
        xVector.x = sqrtInvMass * random4b.w;
        xVector.y = sqrtInvMass * random2b.x;
        xVector.z = sqrtInvMass * random2b.y;
        xVector.w = 0.0f;   // keep the stored float4 fully defined
        xPrime.x += xVector.x - Xmh.x;
        xPrime.y += xVector.y - Xmh.y;
        xPrime.z += xVector.z - Xmh.z;
        cSim.pPosq[pos] = xPrime;
        cSim.pVelm4[pos] = velocity;
        cSim.pxVector4[pos] = xVector;
        pos += blockDim.x * gridDim.x;
    }

    // Update random position pointer (one writer per block).
    if (threadIdx.x == 0)
    {
        rpos += cSim.paddedNumberOfAtoms;
        if (rpos > cSim.randoms)
            rpos -= cSim.randoms;
        cSim.pRandomPosition[blockIdx.x] = rpos;
    }
}
// Variant of kUpdatePart2_kernel that also accumulates linear momentum.
//
// The per-atom work matches kUpdatePart2_kernel. In addition each thread sums
// (1 / velocity.w) * velocity over the atoms it handles, scales the sum by
// cSim.inverseTotalMass, and the block reduces these per-thread values in the
// dynamic shared array sCM. Thread 0 stores the block total in
// cSim.pLinearMomentum[blockIdx.x] (w component set to zero).
//
// Requires blockDim.x float3 entries of dynamic shared memory.
//
// xVector.w is explicitly zeroed before the float4 is stored: it was
// previously left unassigned, so an indeterminate value ended up in
// pxVector4. The block-total temporary is also given its own name instead of
// shadowing the per-thread accumulator.
__global__ void kUpdatePart2CM_kernel()
{
    extern __shared__ float3 sCM[];
    unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int rpos = cSim.pRandomPosition[blockIdx.x];
    float3 CM = {0.0f, 0.0f, 0.0f};
    __syncthreads();
    while (pos < cSim.atoms)
    {
        float4 velocity = cSim.pVelm4[pos];
#ifndef DeltaShake
        float4 apos = cSim.pPosq[pos];
#endif
        float4 xPrime = cSim.pPosqP[pos];
        float4 vVector = cSim.pvVector4[pos];
        float4 xVector;
        float4 random4b = cSim.pRandom4b[rpos + pos];
        float2 random2b = cSim.pRandom2b[rpos + pos];
        float3 Xmh;
        // velocity.w holds the inverse mass.
        float mass = 1.0f / velocity.w;
        float sqrtInvMass = sqrt(velocity.w);
#ifdef DeltaShake
        velocity.x = xPrime.x * cSim.oneOverFix1;
        velocity.y = xPrime.y * cSim.oneOverFix1;
        velocity.z = xPrime.z * cSim.oneOverFix1;
#else
        velocity.x = (xPrime.x - apos.x) * cSim.oneOverFix1;
        velocity.y = (xPrime.y - apos.y) * cSim.oneOverFix1;
        velocity.z = (xPrime.z - apos.z) * cSim.oneOverFix1;
#endif
        // Per-thread momentum accumulation.
        CM.x += mass * velocity.x;
        CM.y += mass * velocity.y;
        CM.z += mass * velocity.z;
        Xmh.x = vVector.x * cSim.TauDOverEMMinusOne +
                sqrtInvMass * random4b.x;
        Xmh.y = vVector.y * cSim.TauDOverEMMinusOne +
                sqrtInvMass * random4b.y;
        Xmh.z = vVector.z * cSim.TauDOverEMMinusOne +
                sqrtInvMass * random4b.z;
        xVector.x = sqrtInvMass * random4b.w;
        xVector.y = sqrtInvMass * random2b.x;
        xVector.z = sqrtInvMass * random2b.y;
        xVector.w = 0.0f;   // keep the stored float4 fully defined
        xPrime.x += xVector.x - Xmh.x;
        xPrime.y += xVector.y - Xmh.y;
        xPrime.z += xVector.z - Xmh.z;
        cSim.pPosq[pos] = xPrime;
        cSim.pVelm4[pos] = velocity;
        cSim.pxVector4[pos] = xVector;
        pos += blockDim.x * gridDim.x;
    }

    // Update random position pointer (one writer per block).
    if (threadIdx.x == 0)
    {
        rpos += cSim.paddedNumberOfAtoms;
        if (rpos > cSim.randoms)
            rpos -= cSim.randoms;
        cSim.pRandomPosition[blockIdx.x] = rpos;
    }

    // Scale CM
    CM.x *= cSim.inverseTotalMass;
    CM.y *= cSim.inverseTotalMass;
    CM.z *= cSim.inverseTotalMass;
    sCM[threadIdx.x] = CM;
    // Every thread's value must be visible before the reduction starts.
    __syncthreads();

    // Reduce CM for CTA: at each level, threads whose id has the low bits in
    // "mask" clear add the slot "offset" entries above them. The barrier is
    // outside the conditional so that every thread reaches it.
    unsigned int offset = 1;
    unsigned int mask = 1;
    while (offset < blockDim.x)
    {
        if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
        {
            sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
            sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
            sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
        }
        mask = 2 * mask + 1;
        offset *= 2;
        __syncthreads();
    }

    // Publish this block's total.
    if (threadIdx.x == 0)
    {
        float4 blockCM;
        blockCM.x = sCM[0].x;
        blockCM.y = sCM[0].y;
        blockCM.z = sCM[0].z;
        blockCM.w = 0.0f;
        cSim.pLinearMomentum[blockIdx.x] = blockCM;
    }
}
extern void kGenerateRandoms(gpuContext gpu);
// Host-side driver for the second half of the integration step.
//
// When gpu->bCalculateCM is set, the centre-of-mass variant of the kernel is
// launched (one float3 of dynamic shared memory per thread), the flag is
// cleared and gpu->bRemoveCM is set; otherwise the plain kernel is launched.
// A function-local call counter triggers kGenerateRandoms() every
// gpu->sim.randomIterations calls. The "#if 0" section is a disabled
// debugging aid.
void kUpdatePart2(gpuContext gpu)
{
    if (!gpu->bCalculateCM)
    {
        kUpdatePart2_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
        LAUNCHERROR("kUpdatePart2");
    }
    else
    {
        kUpdatePart2CM_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block, gpu->sim.update_threads_per_block * sizeof(float3)>>>();
        LAUNCHERROR("kUpdatePart2CM");
        gpu->bCalculateCM = false;
        gpu->bRemoveCM = true;
#if 0
        gpu->psLinearMomentum->Download();
        gpu->psVelm4->Download();
        float3 mv = {0.0f, 0.0f, 0.0f};
        for (int i = 0; i < gpu->natoms; i++)
        {
            float mass = 1.0f / gpu->psVelm4->_pSysStream[0][i].w;
            mv.x += mass * gpu->psVelm4->_pSysStream[0][i].x;
            mv.y += mass * gpu->psVelm4->_pSysStream[0][i].y;
            mv.z += mass * gpu->psVelm4->_pSysStream[0][i].z;
        }
        mv.x *= gpu->sim.inverseTotalMass;
        mv.y *= gpu->sim.inverseTotalMass;
        mv.z *= gpu->sim.inverseTotalMass;
        float3 mv1 = {0.0f, 0.0f, 0.0f};
        for (int i = 0; i < gpu->sim.blocks; i++)
        {
            mv1.x += gpu->psLinearMomentum->_pSysStream[0][i].x;
            mv1.y += gpu->psLinearMomentum->_pSysStream[0][i].y;
            mv1.z += gpu->psLinearMomentum->_pSysStream[0][i].z;
        }
        printf("%11.5f %11.5f %11.5f | %11.5f %11.5f %11.5f\n", mv.x, mv.y, mv.z, mv1.x, mv1.y, mv1.z);
#endif
    }

    // Refill the random buffers once every randomIterations calls.
    static int callsSinceRefill = 0;
    if (++callsSinceRefill == gpu->sim.randomIterations)
    {
        kGenerateRandoms(gpu);
        callsSinceRefill = 0;
    }
}
// Iteratively adjusts the positions of atoms grouped into constraint clusters.
//
// Each entry of cSim.pShakeID names a central atom (atomID.x) and up to three
// partner atoms (atomID.y always, atomID.z / atomID.w optionally; -1 marks an
// absent partner). For every entry the kernel repeatedly corrects the central
// atom and each present partner until, for every pair, the squared-distance
// error divided by (d2 * cSim.shakeTolerance) is below 1, or until 15 passes
// have been made. The adjusted positions are then written back to cSim.pPosq.
//
// Work distribution: each thread starts at its global thread index and advances
// by blockDim.x * gridDim.x until cSim.ShakeConstraints is reached.
//
// Shared memory: a statically sized array of Atom records, one per thread,
// indexed by threadIdx.x.
// NOTE(review): the array has G8X_THREADS_PER_BLOCK entries, so the launch
// block size presumably must not exceed that value -- confirm against the launcher.
__global__ void kApplySecondShake_kernel()
{
// Per-thread scratch record kept in shared memory.
__shared__ Atom sA[G8X_THREADS_PER_BLOCK];
Atom* psA = &sA[threadIdx.x];
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
while (pos < cSim.ShakeConstraints)
{
// atomID.x = central atom, atomID.y/.z/.w = partners (.z and .w are tested against -1 below).
int4 atomID = cSim.pShakeID[pos];
// params.x -> InvMassI, params.y -> M, params.z -> d2, params.w -> invMassJ (assigned below).
float4 params = cSim.pShakeParameter[pos];
// apos* are read from pOldPosq, xp* from pPosq.
float4 apos = cSim.pOldPosq[atomID.x];
float4 xpi = cSim.pPosq[atomID.x];
float4 apos1 = cSim.pOldPosq[atomID.y];
float4 xpj1 = cSim.pPosq[atomID.y];
// Partners 2 and 3 default to zero and are only loaded when their index is not -1.
float4 apos2 = {0.0f, 0.0f, 0.0f, 0.0f};
float4 xpj2 = {0.0f, 0.0f, 0.0f, 0.0f};
psA->InvMassI = params.x;
psA->M = params.y;
psA->d2 = params.z;
float invMassJ = params.w;
if (atomID.z != -1)
{
apos2 = cSim.pOldPosq[atomID.z];
xpj2 = cSim.pPosq[atomID.z];
}
float4 apos3 = {0.0f, 0.0f, 0.0f, 0.0f};
float4 xpj3 = {0.0f, 0.0f, 0.0f, 0.0f};
if (atomID.w != -1)
{
apos3 = cSim.pOldPosq[atomID.w];
xpj3 = cSim.pPosq[atomID.w];
}
// Keep only the x/y/z components of the pOldPosq values.
float3 xi, xj1, xj2, xj3;
xi.x = apos.x;
xi.y = apos.y;
xi.z = apos.z;
xj1.x = apos1.x;
xj1.y = apos1.y;
xj1.z = apos1.z;
xj2.x = apos2.x;
xj2.y = apos2.y;
xj2.z = apos2.z;
xj3.x = apos3.x;
xj3.y = apos3.y;
xj3.z = apos3.z;
// Only when DeltaShake is NOT defined: turn the pPosq values into offsets from
// the pOldPosq values (the offsets are turned back after the iteration loop).
// NOTE(review): when DeltaShake is defined this step is skipped, so pPosq
// presumably already holds such offsets on entry -- confirm with the producer.
#ifndef DeltaShake
xpi.x -= xi.x;
xpi.y -= xi.y;
xpi.z -= xi.z;
xpj1.x -= xj1.x;
xpj1.y -= xj1.y;
xpj1.z -= xj1.z;
xpj2.x -= xj2.x;
xpj2.y -= xj2.y;
xpj2.z -= xj2.z;
xpj3.x -= xj3.x;
xpj3.y -= xj3.y;
xpj3.z -= xj3.z;
#endif
// Separation vectors between the central atom and each partner, taken from pOldPosq.
psA->rij1.x = xi.x - xj1.x;
psA->rij1.y = xi.y - xj1.y;
psA->rij1.z = xi.z - xj1.z;
psA->rij2.x = xi.x - xj2.x;
psA->rij2.y = xi.y - xj2.y;
psA->rij2.z = xi.z - xj2.z;
psA->rij3.x = xi.x - xj3.x;
psA->rij3.y = xi.y - xj3.y;
psA->rij3.z = xi.z - xj3.z;
// Squared lengths of those separation vectors.
psA->rij1sq = psA->rij1.x * psA->rij1.x + psA->rij1.y * psA->rij1.y + psA->rij1.z * psA->rij1.z;
psA->rij2sq = psA->rij2.x * psA->rij2.x + psA->rij2.y * psA->rij2.y + psA->rij2.z * psA->rij2.z;
psA->rij3sq = psA->rij3.x * psA->rij3.x + psA->rij3.y * psA->rij3.y + psA->rij3.z * psA->rij3.z;
// Difference between d2 and each squared separation.
float ld1 = psA->d2 - psA->rij1sq;
float ld2 = psA->d2 - psA->rij2sq;
float ld3 = psA->d2 - psA->rij3sq;
bool converged = false;
int iteration = 0;
// At most 15 passes; a pass that applies no correction ends the loop.
while (iteration < 15 && !converged)
{
converged = true;
// --- Pair (central atom, partner 1) ---
float3 rpij;
rpij.x = xpi.x - xpj1.x;
rpij.y = xpi.y - xpj1.y;
rpij.z = xpi.z - xpj1.z;
float rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
float rrpr = psA->rij1.x * rpij.x + psA->rij1.y * rpij.y + psA->rij1.z * rpij.z;
// Error measure: |ld1 - 2*(rij1 . rpij) - |rpij|^2| divided by d2 * shakeTolerance.
float diff = fabs(ld1 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance );
if (diff >= 1.0f)
{
// Correction along rij1, scaled by M; the central atom moves by +dr * InvMassI
// and the partner by -dr * invMassJ.
float acor = (ld1 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij1sq);
float3 dr;
dr.x = psA->rij1.x * acor;
dr.y = psA->rij1.y * acor;
dr.z = psA->rij1.z * acor;
xpi.x += dr.x * psA->InvMassI;
xpi.y += dr.y * psA->InvMassI;
xpi.z += dr.z * psA->InvMassI;
xpj1.x -= dr.x * invMassJ;
xpj1.y -= dr.y * invMassJ;
xpj1.z -= dr.z * invMassJ;
converged = false;
}
// --- Pair (central atom, partner 2), only if present; uses the already-updated xpi ---
if (atomID.z != -1)
{
rpij.x = xpi.x - xpj2.x;
rpij.y = xpi.y - xpj2.y;
rpij.z = xpi.z - xpj2.z;
rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
rrpr = psA->rij2.x * rpij.x + psA->rij2.y * rpij.y + psA->rij2.z * rpij.z;
diff = fabs(ld2 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance );
if (diff >= 1.0f)
{
float acor = (ld2 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij2sq);
float3 dr;
dr.x = psA->rij2.x * acor;
dr.y = psA->rij2.y * acor;
dr.z = psA->rij2.z * acor;
xpi.x += dr.x * psA->InvMassI;
xpi.y += dr.y * psA->InvMassI;
xpi.z += dr.z * psA->InvMassI;
xpj2.x -= dr.x * invMassJ;
xpj2.y -= dr.y * invMassJ;
xpj2.z -= dr.z * invMassJ;
converged = false;
}
}
// --- Pair (central atom, partner 3), only if present ---
if (atomID.w != -1)
{
rpij.x = xpi.x - xpj3.x;
rpij.y = xpi.y - xpj3.y;
rpij.z = xpi.z - xpj3.z;
rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
rrpr = psA->rij3.x * rpij.x + psA->rij3.y * rpij.y + psA->rij3.z * rpij.z;
diff = fabs(ld3 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance );
if (diff >= 1.0f)
{
float acor = (ld3 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij3sq);
float3 dr;
dr.x = psA->rij3.x * acor;
dr.y = psA->rij3.y * acor;
dr.z = psA->rij3.z * acor;
xpi.x += dr.x * psA->InvMassI;
xpi.y += dr.y * psA->InvMassI;
xpi.z += dr.z * psA->InvMassI;
xpj3.x -= dr.x * invMassJ;
xpj3.y -= dr.y * invMassJ;
xpj3.z -= dr.z * invMassJ;
converged = false;
}
}
iteration++;
}
// Add the pOldPosq coordinates to the corrected values (done regardless of DeltaShake).
xpi.x += xi.x;
xpi.y += xi.y;
xpi.z += xi.z;
xpj1.x += xj1.x;
xpj1.y += xj1.y;
xpj1.z += xj1.z;
xpj2.x += xj2.x;
xpj2.y += xj2.y;
xpj2.z += xj2.z;
xpj3.x += xj3.x;
xpj3.y += xj3.y;
xpj3.z += xj3.z;
// Write back; absent partners (index -1) are not stored.
cSim.pPosq[atomID.x] = xpi;
cSim.pPosq[atomID.y] = xpj1;
if (atomID.z != -1)
cSim.pPosq[atomID.z] = xpj2;
if (atomID.w != -1)
cSim.pPosq[atomID.w] = xpj3;
pos += blockDim.x * gridDim.x;
}
}
/**
 * For every entry of cSim.pNonShakeID, adds the x/y/z components of that
 * atom's cSim.pOldPosq value to its cSim.pPosq value. The w component of the
 * pPosq value is written back unchanged.
 *
 * Each thread starts at its global thread index and steps by the total number
 * of launched threads until cSim.NonShakeConstraints is reached.
 */
__global__ void kApplyNoShake_kernel()
{
    const unsigned int stride = blockDim.x * gridDim.x;
    for (unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < cSim.NonShakeConstraints; idx += stride)
    {
        const int atom = cSim.pNonShakeID[idx];
        const float4 previous = cSim.pOldPosq[atom];
        float4 current = cSim.pPosq[atom];
        current.x += previous.x;
        current.y += previous.y;
        current.z += previous.z;
        cSim.pPosq[atom] = current;
    }
}
// Empty stub: the body contains no statements, so calling it has no effect.
// Its only visible use is a commented-out call in kApplySecondShake.
// NOTE(review): presumably a placeholder for a host-side reference version of
// the second constraint pass -- confirm before relying on it.
void kCPUShake2(gpuContext gpu)
{
}
/**
 * Host-side launcher for the second constraint pass.
 *
 * Launches kApplySecondShake_kernel when gpu->sim.ShakeConstraints is
 * positive. When the DeltaShake macro is defined, it additionally launches
 * kApplyNoShake_kernel if gpu->sim.NonShakeConstraints is positive.
 * The host stub kCPUShake2 is not called.
 */
void kApplySecondShake(gpuContext gpu)
{
    const bool haveShakeClusters = gpu->sim.ShakeConstraints > 0;
    if (haveShakeClusters)
    {
        kApplySecondShake_kernel<<<gpu->sim.blocks, gpu->sim.shake_threads_per_block>>>();
        LAUNCHERROR("kApplySecondShake");
    }

#ifdef DeltaShake
    // Atoms listed in pNonShakeID are handled by their own kernel.
    const bool haveUnconstrainedAtoms = gpu->sim.NonShakeConstraints > 0;
    if (haveUnconstrainedAtoms)
    {
        kApplyNoShake_kernel<<<gpu->sim.blocks, gpu->sim.nonshake_threads_per_block>>>();
        LAUNCHERROR("kApplyNoShake");
    }
#endif
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
//#include <fstream>
using namespace std;
#include "gputypes.h"
#define DeltaShake
static __constant__ cudaGmxSimulation cSim;
/**
 * Uploads gpu->sim into this file's __constant__ symbol cSim
 * (sizeof(cudaGmxSimulation) bytes). The returned status is passed to RTERROR.
 */
void SetVerletUpdateSim(gpuContext gpu)
{
    cudaError_t status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
/**
 * Copies this file's __constant__ symbol cSim back into gpu->sim
 * (sizeof(cudaGmxSimulation) bytes). The returned status is passed to RTERROR
 * together with a message identifying this download.
 */
void GetVerletUpdateSim(gpuContext gpu)
{
    cudaError_t status;
    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    // The message used to read "SetSim", a copy-paste slip from
    // SetVerletUpdateSim that mislabelled which transfer had failed.
    RTERROR(status, "cudaMemcpyFromSymbol: GetSim copy from cSim failed");
}
/**
 * First half of the Verlet step, without centre-of-mass handling.
 *
 * For each atom index below cSim.atoms:
 *   - the current pPosq value is saved to pOldPosq;
 *   - the velocity's x/y/z are advanced by deltaT * velocity.w * force;
 *   - pPosqP receives either the position advanced by velocity * deltaT
 *     (DeltaShake undefined) or just the displacement velocity * deltaT with
 *     the original w component (DeltaShake defined);
 *   - the updated velocity is stored to pVelm4.
 *
 * Each thread starts at its global thread index and steps by the total number
 * of launched threads.
 */
__global__ void kVerletUpdatePart1_kernel()
{
    const unsigned int stride = blockDim.x * gridDim.x;
    __syncthreads();
    for (unsigned int atom = threadIdx.x + blockIdx.x * blockDim.x; atom < cSim.atoms; atom += stride)
    {
        float4 posq = cSim.pPosq[atom];
        float4 velm = cSim.pVelm4[atom];
        const float4 frc = cSim.pForce4[atom];
        // velm.w multiplies deltaT to form the per-atom force scale.
        const float dtOverMass = cSim.deltaT*velm.w;

        cSim.pOldPosq[atom] = posq;

        velm.x += dtOverMass*frc.x;
        velm.y += dtOverMass*frc.y;
        velm.z += dtOverMass*frc.z;
#ifndef DeltaShake
        posq.x += velm.x*cSim.deltaT;
        posq.y += velm.y*cSim.deltaT;
        posq.z += velm.z*cSim.deltaT;
#else
        posq.x = velm.x*cSim.deltaT;
        posq.y = velm.y*cSim.deltaT;
        posq.z = velm.z*cSim.deltaT;
#endif
        cSim.pPosqP[atom] = posq;
        cSim.pVelm4[atom] = velm;
    }
}
/**
 * First half of the Verlet step, with subtraction of the summed
 * cSim.pLinearMomentum values from every velocity.
 *
 * Stage 1: every block sums all gridDim.x entries of cSim.pLinearMomentum.
 *          Each thread accumulates a strided subset, then the per-thread
 *          partial sums are combined in shared memory so that sCM[0] holds
 *          the total.
 * Stage 2: same per-atom update as kVerletUpdatePart1_kernel, except that the
 *          total from stage 1 is subtracted from the velocity increment.
 *
 * Requires blockDim.x * sizeof(float3) bytes of dynamic shared memory.
 */
__global__ void kVerletUpdatePart1CM_kernel()
{
    extern __shared__ float3 sCM[];

    // Stage 1a: per-thread partial sum over the per-block entries.
    float3 partial = { 0.0f, 0.0f, 0.0f };
    for (unsigned int blk = threadIdx.x; blk < gridDim.x; blk += blockDim.x)
    {
        const float4 entry = cSim.pLinearMomentum[blk];
        partial.x += entry.x;
        partial.y += entry.y;
        partial.z += entry.z;
    }
    sCM[threadIdx.x] = partial;
    __syncthreads();

    // Stage 1b: pairwise combination in shared memory; the distance between
    // combined slots doubles each round and every round ends with a barrier.
    for (unsigned int offset = 1, mask = 1; offset < blockDim.x; offset *= 2, mask = 2 * mask + 1)
    {
        const bool isReceiver = (threadIdx.x & mask) == 0;
        if (isReceiver && (threadIdx.x + offset < blockDim.x))
        {
            sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
            sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
            sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
        }
        __syncthreads();
    }

    // Stage 2: per-atom update using the block-wide total in sCM[0].
    const float3 total = sCM[0];
    const unsigned int stride = blockDim.x * gridDim.x;
    for (unsigned int atom = threadIdx.x + blockIdx.x * blockDim.x; atom < cSim.atoms; atom += stride)
    {
        float4 posq = cSim.pPosq[atom];
        float4 velm = cSim.pVelm4[atom];
        const float4 frc = cSim.pForce4[atom];
        const float dtOverMass = cSim.deltaT*velm.w;

        cSim.pOldPosq[atom] = posq;

        velm.x += dtOverMass*frc.x-total.x;
        velm.y += dtOverMass*frc.y-total.y;
        velm.z += dtOverMass*frc.z-total.z;
#ifndef DeltaShake
        posq.x += velm.x*cSim.deltaT;
        posq.y += velm.y*cSim.deltaT;
        posq.z += velm.z*cSim.deltaT;
#else
        posq.x = velm.x*cSim.deltaT;
        posq.y = velm.y*cSim.deltaT;
        posq.z = velm.z*cSim.deltaT;
#endif
        cSim.pPosqP[atom] = posq;
        cSim.pVelm4[atom] = velm;
    }
}
/**
 * Host-side launcher for the first half of the Verlet step.
 *
 * If gpu->bRemoveCM is set, the CM variant is launched with one float3 of
 * dynamic shared memory per thread and the flag is then cleared; otherwise
 * the plain kernel is launched.
 */
void kVerletUpdatePart1(gpuContext gpu)
{
    const unsigned int threadsPerBlock = gpu->sim.update_threads_per_block;

    if (!gpu->bRemoveCM)
    {
        kVerletUpdatePart1_kernel<<<gpu->sim.blocks, threadsPerBlock>>>();
        LAUNCHERROR("kVerletUpdatePart1");
    }
    else
    {
        kVerletUpdatePart1CM_kernel<<<gpu->sim.blocks, threadsPerBlock, threadsPerBlock * sizeof(float3)>>>();
        LAUNCHERROR("kVerletUpdatePart1CM");
        gpu->bRemoveCM = false;
    }
}
/**
 * Second half of the Verlet step, without centre-of-mass accumulation.
 *
 * For each atom index below cSim.atoms:
 *   - DeltaShake undefined: velocity = oneOverDeltaT * (pPosqP - pPosq), and
 *     the pPosqP value is stored to pPosq unchanged;
 *   - DeltaShake defined: velocity = oneOverDeltaT * pPosqP, and pPosq
 *     receives pPosqP with the old pPosq x/y/z added.
 * The velocity's w component is preserved.
 *
 * Each thread starts at its global thread index and steps by the total number
 * of launched threads.
 */
__global__ void kVerletUpdatePart2_kernel()
{
    const unsigned int stride = blockDim.x * gridDim.x;
    __syncthreads();
    for (unsigned int atom = threadIdx.x + blockIdx.x * blockDim.x; atom < cSim.atoms; atom += stride)
    {
        float4 velm = cSim.pVelm4[atom];
        const float4 posq = cSim.pPosq[atom];
        float4 stepped = cSim.pPosqP[atom];
#ifndef DeltaShake
        velm.x = cSim.oneOverDeltaT*(stepped.x-posq.x);
        velm.y = cSim.oneOverDeltaT*(stepped.y-posq.y);
        velm.z = cSim.oneOverDeltaT*(stepped.z-posq.z);
#else
        velm.x = cSim.oneOverDeltaT*(stepped.x);
        velm.y = cSim.oneOverDeltaT*(stepped.y);
        velm.z = cSim.oneOverDeltaT*(stepped.z);
        stepped.x += posq.x;
        stepped.y += posq.y;
        stepped.z += posq.z;
#endif
        cSim.pPosq[atom] = stepped;
        cSim.pVelm4[atom] = velm;
    }
}
/**
 * Second half of the Verlet step, additionally producing one per-block sum of
 * (1 / velocity.w) * velocity scaled by cSim.inverseTotalMass.
 *
 * The per-atom update matches kVerletUpdatePart2_kernel. While iterating,
 * each thread accumulates mass * new velocity, where mass = 1 / velocity.w.
 * After scaling by cSim.inverseTotalMass, the per-thread sums are combined in
 * shared memory and thread 0 writes the block total (w = 0) to
 * cSim.pLinearMomentum[blockIdx.x].
 *
 * Requires blockDim.x * sizeof(float3) bytes of dynamic shared memory.
 */
__global__ void kVerletUpdatePart2CM_kernel()
{
    extern __shared__ float3 sCM[];

    float3 partial = {0.0f, 0.0f, 0.0f};
    const unsigned int stride = blockDim.x * gridDim.x;
    __syncthreads();
    for (unsigned int atom = threadIdx.x + blockIdx.x * blockDim.x; atom < cSim.atoms; atom += stride)
    {
        float4 velm = cSim.pVelm4[atom];
        const float4 posq = cSim.pPosq[atom];
        float4 stepped = cSim.pPosqP[atom];
        // velm.w is inverted to obtain the weight used in the accumulation below.
        const float mass = 1.0f / velm.w;
#ifndef DeltaShake
        velm.x = cSim.oneOverDeltaT*(stepped.x-posq.x);
        velm.y = cSim.oneOverDeltaT*(stepped.y-posq.y);
        velm.z = cSim.oneOverDeltaT*(stepped.z-posq.z);
#else
        velm.x = cSim.oneOverDeltaT*(stepped.x);
        velm.y = cSim.oneOverDeltaT*(stepped.y);
        velm.z = cSim.oneOverDeltaT*(stepped.z);
        stepped.x += posq.x;
        stepped.y += posq.y;
        stepped.z += posq.z;
#endif
        partial.x += mass * velm.x;
        partial.y += mass * velm.y;
        partial.z += mass * velm.z;
        cSim.pPosq[atom] = stepped;
        cSim.pVelm4[atom] = velm;
    }

    // Scale this thread's sum and publish it to shared memory.
    partial.x *= cSim.inverseTotalMass;
    partial.y *= cSim.inverseTotalMass;
    partial.z *= cSim.inverseTotalMass;
    sCM[threadIdx.x] = partial;
    __syncthreads();

    // Pairwise combination in shared memory; the distance between combined
    // slots doubles each round and every round ends with a barrier.
    for (unsigned int offset = 1, mask = 1; offset < blockDim.x; offset *= 2, mask = 2 * mask + 1)
    {
        const bool isReceiver = (threadIdx.x & mask) == 0;
        if (isReceiver && (threadIdx.x + offset < blockDim.x))
        {
            sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
            sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
            sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
        }
        __syncthreads();
    }

    // Thread 0 stores the block total, with w set to zero.
    if (threadIdx.x == 0)
    {
        float4 blockTotal = { sCM[0].x, sCM[0].y, sCM[0].z, 0.0f };
        cSim.pLinearMomentum[blockIdx.x] = blockTotal;
    }
}
/**
 * Host-side launcher for the second half of the Verlet step.
 *
 * If gpu->bCalculateCM is set, the CM variant is launched with one float3 of
 * dynamic shared memory per thread; afterwards bCalculateCM is cleared and
 * bRemoveCM is set. Otherwise the plain kernel is launched.
 */
void kVerletUpdatePart2(gpuContext gpu)
{
    const unsigned int threadsPerBlock = gpu->sim.update_threads_per_block;

    if (!gpu->bCalculateCM)
    {
        kVerletUpdatePart2_kernel<<<gpu->sim.blocks, threadsPerBlock>>>();
        LAUNCHERROR("kVerletUpdatePart2");
    }
    else
    {
        kVerletUpdatePart2CM_kernel<<<gpu->sim.blocks, threadsPerBlock, threadsPerBlock * sizeof(float3)>>>();
        LAUNCHERROR("kVerletUpdatePart2CM");
        gpu->bCalculateCM = false;
        gpu->bRemoveCM = true;
    }
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment