Commit 3749e84b authored by Peter Eastman's avatar Peter Eastman
Browse files

Various cleanups to integration code

parent 1ef6e1d8
...@@ -39,7 +39,7 @@ ...@@ -39,7 +39,7 @@
namespace OpenMM { namespace OpenMM {
/** /**
* This is an Integrator which simulates a System using the velocity Verlet algorithm. * This is an Integrator which simulates a System using the leap-frog Verlet algorithm.
*/ */
class OPENMM_EXPORT VerletIntegrator : public Integrator { class OPENMM_EXPORT VerletIntegrator : public Integrator {
......
...@@ -488,7 +488,7 @@ void CudaIntegrateLangevinStepKernel::execute(OpenMMContextImpl& context, const ...@@ -488,7 +488,7 @@ void CudaIntegrateLangevinStepKernel::execute(OpenMMContextImpl& context, const
prevFriction = friction; prevFriction = friction;
prevStepSize = stepSize; prevStepSize = stepSize;
} }
kUpdatePart1(gpu); kLangevinUpdatePart1(gpu);
kApplyFirstShake(gpu); kApplyFirstShake(gpu);
kApplyFirstSettle(gpu); kApplyFirstSettle(gpu);
kApplyFirstCCMA(gpu); kApplyFirstCCMA(gpu);
...@@ -497,7 +497,7 @@ void CudaIntegrateLangevinStepKernel::execute(OpenMMContextImpl& context, const ...@@ -497,7 +497,7 @@ void CudaIntegrateLangevinStepKernel::execute(OpenMMContextImpl& context, const
if (step%data.cmMotionFrequency == 0) if (step%data.cmMotionFrequency == 0)
gpu->bCalculateCM = true; gpu->bCalculateCM = true;
} }
kUpdatePart2(gpu); kLangevinUpdatePart2(gpu);
kApplySecondShake(gpu); kApplySecondShake(gpu);
kApplySecondSettle(gpu); kApplySecondSettle(gpu);
kApplySecondCCMA(gpu); kApplySecondCCMA(gpu);
......
...@@ -40,14 +40,14 @@ extern void kCalculateObcGbsaForces2(gpuContext gpu); ...@@ -40,14 +40,14 @@ extern void kCalculateObcGbsaForces2(gpuContext gpu);
extern void kCalculateLocalForces(gpuContext gpu); extern void kCalculateLocalForces(gpuContext gpu);
extern void kCalculateAndersenThermostat(gpuContext gpu); extern void kCalculateAndersenThermostat(gpuContext gpu);
extern void kReduceBornSumAndForces(gpuContext gpu); extern void kReduceBornSumAndForces(gpuContext gpu);
extern void kUpdatePart1(gpuContext gpu);
extern void kApplyFirstShake(gpuContext gpu); extern void kApplyFirstShake(gpuContext gpu);
extern void kApplyFirstCCMA(gpuContext gpu); extern void kApplyFirstCCMA(gpuContext gpu);
extern void kApplyFirstSettle(gpuContext gpu); extern void kApplyFirstSettle(gpuContext gpu);
extern void kUpdatePart2(gpuContext gpu);
extern void kApplySecondShake(gpuContext gpu); extern void kApplySecondShake(gpuContext gpu);
extern void kApplySecondCCMA(gpuContext gpu); extern void kApplySecondCCMA(gpuContext gpu);
extern void kApplySecondSettle(gpuContext gpu); extern void kApplySecondSettle(gpuContext gpu);
extern void kLangevinUpdatePart1(gpuContext gpu);
extern void kLangevinUpdatePart2(gpuContext gpu);
extern void kVerletUpdatePart1(gpuContext gpu); extern void kVerletUpdatePart1(gpuContext gpu);
extern void kVerletUpdatePart2(gpuContext gpu); extern void kVerletUpdatePart2(gpuContext gpu);
extern void kBrownianUpdatePart1(gpuContext gpu); extern void kBrownianUpdatePart1(gpuContext gpu);
...@@ -72,8 +72,10 @@ extern void SetCalculateAndersenThermostatSim(gpuContext gpu); ...@@ -72,8 +72,10 @@ extern void SetCalculateAndersenThermostatSim(gpuContext gpu);
extern void GetCalculateAndersenThermostatSim(gpuContext gpu); extern void GetCalculateAndersenThermostatSim(gpuContext gpu);
extern void SetForcesSim(gpuContext gpu); extern void SetForcesSim(gpuContext gpu);
extern void GetForcesSim(gpuContext gpu); extern void GetForcesSim(gpuContext gpu);
extern void SetUpdateShakeHSim(gpuContext gpu); extern void SetShakeHSim(gpuContext gpu);
extern void GetUpdateShakeHSim(gpuContext gpu); extern void GetShakeHSim(gpuContext gpu);
extern void SetLangevinUpdateSim(gpuContext gpu);
extern void GetLangevinUpdateSim(gpuContext gpu);
extern void SetSettleSim(gpuContext gpu); extern void SetSettleSim(gpuContext gpu);
extern void GetSettleSim(gpuContext gpu); extern void GetSettleSim(gpuContext gpu);
extern void SetCCMASim(gpuContext gpu); extern void SetCCMASim(gpuContext gpu);
......
...@@ -1757,7 +1757,8 @@ int gpuSetConstants(gpuContext gpu) ...@@ -1757,7 +1757,8 @@ int gpuSetConstants(gpuContext gpu)
SetCalculateObcGbsaForces2Sim(gpu); SetCalculateObcGbsaForces2Sim(gpu);
SetCalculateAndersenThermostatSim(gpu); SetCalculateAndersenThermostatSim(gpu);
SetForcesSim(gpu); SetForcesSim(gpu);
SetUpdateShakeHSim(gpu); SetShakeHSim(gpu);
SetLangevinUpdateSim(gpu);
SetVerletUpdateSim(gpu); SetVerletUpdateSim(gpu);
SetBrownianUpdateSim(gpu); SetBrownianUpdateSim(gpu);
SetSettleSim(gpu); SetSettleSim(gpu);
......
...@@ -35,8 +35,6 @@ using namespace std; ...@@ -35,8 +35,6 @@ using namespace std;
#include "gputypes.h" #include "gputypes.h"
#define DeltaShake
static __constant__ cudaGmxSimulation cSim; static __constant__ cudaGmxSimulation cSim;
void SetBrownianUpdateSim(gpuContext gpu) void SetBrownianUpdateSim(gpuContext gpu)
...@@ -66,15 +64,9 @@ __global__ void kBrownianUpdatePart1_kernel() ...@@ -66,15 +64,9 @@ __global__ void kBrownianUpdatePart1_kernel()
float4 force = cSim.pForce4[pos]; float4 force = cSim.pForce4[pos];
cSim.pOldPosq[pos] = apos; cSim.pOldPosq[pos] = apos;
#ifndef DeltaShake
apos.x += force.x*cSim.GDT + random4a.x;
apos.y += force.y*cSim.GDT + random4a.y;
apos.z += force.z*cSim.GDT + random4a.z;
#else
apos.x = force.x*cSim.GDT + random4a.x; apos.x = force.x*cSim.GDT + random4a.x;
apos.y = force.y*cSim.GDT + random4a.y; apos.y = force.y*cSim.GDT + random4a.y;
apos.z = force.z*cSim.GDT + random4a.z; apos.z = force.z*cSim.GDT + random4a.z;
#endif
cSim.pPosqP[pos] = apos; cSim.pPosqP[pos] = apos;
pos += blockDim.x * gridDim.x; pos += blockDim.x * gridDim.x;
} }
...@@ -99,11 +91,6 @@ __global__ void kBrownianUpdatePart2_kernel() ...@@ -99,11 +91,6 @@ __global__ void kBrownianUpdatePart2_kernel()
float4 apos = cSim.pPosq[pos]; float4 apos = cSim.pPosq[pos];
float4 xPrime = cSim.pPosqP[pos]; float4 xPrime = cSim.pPosqP[pos];
#ifndef DeltaShake
velocity.x = cSim.oneOverDeltaT*(xPrime.x-apos.x);
velocity.y = cSim.oneOverDeltaT*(xPrime.y-apos.y);
velocity.z = cSim.oneOverDeltaT*(xPrime.z-apos.z);
#else
velocity.x = cSim.oneOverDeltaT*(xPrime.x); velocity.x = cSim.oneOverDeltaT*(xPrime.x);
velocity.y = cSim.oneOverDeltaT*(xPrime.y); velocity.y = cSim.oneOverDeltaT*(xPrime.y);
velocity.z = cSim.oneOverDeltaT*(xPrime.z); velocity.z = cSim.oneOverDeltaT*(xPrime.z);
...@@ -111,7 +98,7 @@ __global__ void kBrownianUpdatePart2_kernel() ...@@ -111,7 +98,7 @@ __global__ void kBrownianUpdatePart2_kernel()
xPrime.x += apos.x; xPrime.x += apos.x;
xPrime.y += apos.y; xPrime.y += apos.y;
xPrime.z += apos.z; xPrime.z += apos.z;
#endif
cSim.pPosq[pos] = xPrime; cSim.pPosq[pos] = xPrime;
cSim.pVelm4[pos] = velocity; cSim.pVelm4[pos] = velocity;
......
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
using namespace std;
#include "gputypes.h"
static __constant__ cudaGmxSimulation cSim;
void SetLangevinUpdateSim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}
void GetLangevinUpdateSim(gpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}
// Include versions of the kernels with and with center of mass motion removal.
#include "kLangevinUpdate.h"
#define REMOVE_CM
#include "kLangevinUpdate.h"
void kLangevinUpdatePart1(gpuContext gpu)
{
// printf("kLangevinUpdatePart1\n");
if (gpu->bRemoveCM)
{
kLangevinUpdatePart1CM_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block, gpu->sim.update_threads_per_block * sizeof(float3)>>>();
LAUNCHERROR("kLangevinUpdatePart1CM");
gpu->bRemoveCM = false;
}
else
{
kLangevinUpdatePart1_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kLangevinUpdatePart1");
}
}
extern void kGenerateRandoms(gpuContext gpu);
void kLangevinUpdatePart2(gpuContext gpu)
{
// printf("kLangevinUpdatePart2\n");
if (gpu->bCalculateCM)
{
kLangevinUpdatePart2CM_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block, gpu->sim.update_threads_per_block * sizeof(float3)>>>();
LAUNCHERROR("kLangevinUpdatePart2CM");
gpu->bCalculateCM = false;
gpu->bRemoveCM = true;
}
else
{
kLangevinUpdatePart2_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kLangevinUpdatePart2");
}
// Update randoms if necessary
gpu->iterations++;
if (gpu->iterations == gpu->sim.randomIterations)
{
kGenerateRandoms(gpu);
gpu->iterations = 0;
}
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
/**
* This file contains the kernels for Langevin integration. It is included
* several times in kLangevinUpdate.cu with different #defines to generate
* different versions of the kernels.
*/
#ifdef REMOVE_CM
__global__ void kLangevinUpdatePart1CM_kernel()
#else
__global__ void kLangevinUpdatePart1_kernel()
#endif
{
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int rpos = cSim.pRandomPosition[blockIdx.x];
#ifdef REMOVE_CM
extern __shared__ float3 sCM[];
float3 CM = { 0.0f, 0.0f, 0.0f};
float4 CM1 = { 0.0f, 0.0f, 0.0f, 0.0f };
// Read CM outputs from previous step
unsigned int cpos = threadIdx.x;
while (cpos < gridDim.x)
{
CM1 = cSim.pLinearMomentum[cpos];
CM.x += CM1.x;
CM.y += CM1.y;
CM.z += CM1.z;
cpos += blockDim.x;
}
sCM[threadIdx.x].x = CM.x;
sCM[threadIdx.x].y = CM.y;
sCM[threadIdx.x].z = CM.z;
__syncthreads();
// Reduce CM
unsigned int offset = 1;
unsigned int mask = 1;
while (offset < blockDim.x)
{
if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
{
sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
}
mask = 2 * mask + 1;
offset *= 2;
__syncthreads();
}
#endif
while (pos < cSim.atoms)
{
float4 velocity = cSim.pVelm4[pos];
float4 xVector = cSim.pxVector4[pos];
float4 random4a = cSim.pRandom4a[rpos + pos];
float2 random2a = cSim.pRandom2a[rpos + pos];
float4 apos = cSim.pPosq[pos];
float4 force = cSim.pForce4[pos];
float3 Vmh;
float sqrtInvMass = sqrt(velocity.w);
Vmh.x = xVector.x * cSim.DOverTauC + sqrtInvMass * random4a.x;
Vmh.y = xVector.y * cSim.DOverTauC + sqrtInvMass * random4a.y;
Vmh.z = xVector.z * cSim.DOverTauC + sqrtInvMass * random4a.z;
float4 vVector;
vVector.x = sqrtInvMass * random4a.w;
vVector.y = sqrtInvMass * random2a.x;
vVector.z = sqrtInvMass * random2a.y;
vVector.w = 0.0f;
cSim.pvVector4[pos] = vVector;
velocity.x = velocity.x * cSim.EM +
velocity.w * force.x * cSim.TauOneMinusEM +
vVector.x -
cSim.EM * Vmh.x;
velocity.y = velocity.y * cSim.EM +
velocity.w * force.y * cSim.TauOneMinusEM +
vVector.y -
cSim.EM * Vmh.y;
velocity.z = velocity.z * cSim.EM +
velocity.w * force.z * cSim.TauOneMinusEM +
vVector.z -
cSim.EM * Vmh.z;
#ifdef REMOVE_CM
velocity.x -= sCM[0].x;
velocity.y -= sCM[0].y;
velocity.z -= sCM[0].z;
#endif
cSim.pOldPosq[pos] = apos;
apos.x = velocity.x * cSim.fix1;
apos.y = velocity.y * cSim.fix1;
apos.z = velocity.z * cSim.fix1;
cSim.pPosqP[pos] = apos;
cSim.pVelm4[pos] = velocity;
pos += blockDim.x * gridDim.x;
}
}
#ifdef REMOVE_CM
__global__ void kLangevinUpdatePart2CM_kernel()
#else
__global__ void kLangevinUpdatePart2_kernel()
#endif
{
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int rpos = cSim.pRandomPosition[blockIdx.x];
#ifdef REMOVE_CM
extern __shared__ float3 sCM[];
float3 CM = {0.0f, 0.0f, 0.0f};
__syncthreads();
#endif
while (pos < cSim.atoms)
{
float4 velocity = cSim.pVelm4[pos];
float4 xPrime = cSim.pPosqP[pos];
float4 vVector = cSim.pvVector4[pos];
float4 xVector;
float4 random4b = cSim.pRandom4b[rpos + pos];
float2 random2b = cSim.pRandom2b[rpos + pos];
float3 Xmh;
float sqrtInvMass = sqrt(velocity.w);
velocity.x = xPrime.x * cSim.oneOverFix1;
velocity.y = xPrime.y * cSim.oneOverFix1;
velocity.z = xPrime.z * cSim.oneOverFix1;
#ifdef REMOVE_CM
float mass = 1.0f / velocity.w;
CM.x += mass * velocity.x;
CM.y += mass * velocity.y;
CM.z += mass * velocity.z;
#endif;
Xmh.x = vVector.x * cSim.TauDOverEMMinusOne +
sqrtInvMass * random4b.x;
Xmh.y = vVector.y * cSim.TauDOverEMMinusOne +
sqrtInvMass * random4b.y;
Xmh.z = vVector.z * cSim.TauDOverEMMinusOne +
sqrtInvMass * random4b.z;
xVector.x = sqrtInvMass * random4b.w;
xVector.y = sqrtInvMass * random2b.x;
xVector.z = sqrtInvMass * random2b.y;
xPrime.x += xVector.x - Xmh.x;
xPrime.y += xVector.y - Xmh.y;
xPrime.z += xVector.z - Xmh.z;
cSim.pPosq[pos] = xPrime;
cSim.pVelm4[pos] = velocity;
cSim.pxVector4[pos] = xVector;
pos += blockDim.x * gridDim.x;
}
// Update random position pointer
if (threadIdx.x == 0)
{
rpos += cSim.paddedNumberOfAtoms;
if (rpos > cSim.randoms)
rpos -= cSim.randoms;
cSim.pRandomPosition[blockIdx.x] = rpos;
}
#ifdef REMOVE_CM
// Scale CM
CM.x *= cSim.inverseTotalMass;
CM.y *= cSim.inverseTotalMass;
CM.z *= cSim.inverseTotalMass;
sCM[threadIdx.x] = CM;
__syncthreads();
// Reduce CM for CTA
unsigned int offset = 1;
unsigned int mask = 1;
while (offset < blockDim.x)
{
if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
{
sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
}
mask = 2 * mask + 1;
offset *= 2;
__syncthreads();
}
if (threadIdx.x == 0)
{
float4 CM;
CM.x = sCM[0].x;
CM.y = sCM[0].y;
CM.z = sCM[0].z;
CM.w = 0.0f;
cSim.pLinearMomentum[blockIdx.x] = CM;
}
#endif
}
...@@ -33,8 +33,6 @@ ...@@ -33,8 +33,6 @@
//#include <fstream> //#include <fstream>
using namespace std; using namespace std;
#define DeltaShake
#include "gputypes.h" #include "gputypes.h"
......
/* -------------------------------------------------------------------------- * /* -------------------------------------------------------------------------- *
* OpenMM * * OpenMM *
* -------------------------------------------------------------------------- * * -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from * * This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of * * Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009 Stanford University and the Authors. * * Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman * * Authors: Scott Le Grand, Peter Eastman *
* Contributors: * * Contributors: *
* * * *
* This program is free software: you can redistribute it and/or modify * * This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published * * it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or * * by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. * * (at your option) any later version. *
* * * *
* This program is distributed in the hope that it will be useful, * * This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of * * but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. * * GNU Lesser General Public License for more details. *
* * * *
* You should have received a copy of the GNU Lesser General Public License * * You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. * * along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */ * -------------------------------------------------------------------------- */
#include <stdio.h> #include <stdio.h>
#include <cuda.h> #include <cuda.h>
#include <vector_functions.h> #include <vector_functions.h>
#include <cstdlib> #include <cstdlib>
#include <string> #include <string>
#include <iostream> #include <iostream>
//#include <fstream> using namespace std;
using namespace std;
#include "gputypes.h"
#define DeltaShake
struct Atom
#include "gputypes.h" {
float3 rij1;
struct Atom float3 rij2;
{ float3 rij3;
float3 rij1; float M;
float3 rij2; float d2;
float3 rij3; float InvMassI;
float M; float rij1sq;
float d2; float rij2sq;
float InvMassI; float rij3sq;
float rij1sq; };
float rij2sq;
float rij3sq;
}; static __constant__ cudaGmxSimulation cSim;
void SetShakeHSim(gpuContext gpu)
static __constant__ cudaGmxSimulation cSim; {
cudaError_t status;
void SetUpdateShakeHSim(gpuContext gpu) status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
{ RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
cudaError_t status; }
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed"); void GetShakeHSim(gpuContext gpu)
} {
cudaError_t status;
void GetUpdateShakeHSim(gpuContext gpu) status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
{ RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
cudaError_t status; }
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed"); __global__ void kApplyFirstShake_kernel()
} {
__shared__ Atom sA[G8X_THREADS_PER_BLOCK];
__global__ void kUpdatePart1_kernel() Atom* psA = &sA[threadIdx.x];
{ unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x; while (pos < cSim.ShakeConstraints)
unsigned int rpos = cSim.pRandomPosition[blockIdx.x]; {
__syncthreads(); int4 atomID = cSim.pShakeID[pos];
float4 params = cSim.pShakeParameter[pos];
while (pos < cSim.atoms) float4 apos = cSim.pOldPosq[atomID.x];
{ float4 xpi = cSim.pPosqP[atomID.x];
float4 velocity = cSim.pVelm4[pos]; float4 apos1 = cSim.pOldPosq[atomID.y];
float4 xVector = cSim.pxVector4[pos]; float4 xpj1 = cSim.pPosqP[atomID.y];
float4 random4a = cSim.pRandom4a[rpos + pos]; float4 apos2 = {0.0f, 0.0f, 0.0f, 0.0f};
float2 random2a = cSim.pRandom2a[rpos + pos]; float4 xpj2 = {0.0f, 0.0f, 0.0f, 0.0f};
float4 apos = cSim.pPosq[pos]; psA->InvMassI = params.x;
float4 force = cSim.pForce4[pos]; psA->M = params.y;
psA->d2 = params.z;
float3 Vmh; float invMassJ = params.w;
float sqrtInvMass = sqrt(velocity.w); if (atomID.z != -1)
Vmh.x = xVector.x * cSim.DOverTauC + sqrtInvMass * random4a.x; {
Vmh.y = xVector.y * cSim.DOverTauC + sqrtInvMass * random4a.y; apos2 = cSim.pOldPosq[atomID.z];
Vmh.z = xVector.z * cSim.DOverTauC + sqrtInvMass * random4a.z; xpj2 = cSim.pPosqP[atomID.z];
float4 vVector; }
vVector.x = sqrtInvMass * random4a.w; float4 apos3 = {0.0f, 0.0f, 0.0f, 0.0f};
vVector.y = sqrtInvMass * random2a.x; float4 xpj3 = {0.0f, 0.0f, 0.0f, 0.0f};
vVector.z = sqrtInvMass * random2a.y; if (atomID.w != -1)
vVector.w = 0.0f; {
cSim.pvVector4[pos] = vVector; apos3 = cSim.pOldPosq[atomID.w];
velocity.x = velocity.x * cSim.EM + xpj3 = cSim.pPosqP[atomID.w];
velocity.w * force.x * cSim.TauOneMinusEM + }
vVector.x -
cSim.EM * Vmh.x; float3 xi, xj1, xj2, xj3;
velocity.y = velocity.y * cSim.EM + xi.x = apos.x;
velocity.w * force.y * cSim.TauOneMinusEM + xi.y = apos.y;
vVector.y - xi.z = apos.z;
cSim.EM * Vmh.y; xj1.x = apos1.x;
velocity.z = velocity.z * cSim.EM + xj1.y = apos1.y;
velocity.w * force.z * cSim.TauOneMinusEM + xj1.z = apos1.z;
vVector.z - xj2.x = apos2.x;
cSim.EM * Vmh.z; xj2.y = apos2.y;
cSim.pOldPosq[pos] = apos; xj2.z = apos2.z;
#ifndef DeltaShake xj3.x = apos3.x;
apos.x += velocity.x * cSim.fix1; xj3.y = apos3.y;
apos.y += velocity.y * cSim.fix1; xj3.z = apos3.z;
apos.z += velocity.z * cSim.fix1; psA->rij1.x = xi.x - xj1.x;
#else psA->rij1.y = xi.y - xj1.y;
apos.x = velocity.x * cSim.fix1; psA->rij1.z = xi.z - xj1.z;
apos.y = velocity.y * cSim.fix1; psA->rij2.x = xi.x - xj2.x;
apos.z = velocity.z * cSim.fix1; psA->rij2.y = xi.y - xj2.y;
#endif psA->rij2.z = xi.z - xj2.z;
cSim.pPosqP[pos] = apos; psA->rij3.x = xi.x - xj3.x;
cSim.pVelm4[pos] = velocity; psA->rij3.y = xi.y - xj3.y;
pos += blockDim.x * gridDim.x; psA->rij3.z = xi.z - xj3.z;
} psA->rij1sq = psA->rij1.x * psA->rij1.x + psA->rij1.y * psA->rij1.y + psA->rij1.z * psA->rij1.z;
} psA->rij2sq = psA->rij2.x * psA->rij2.x + psA->rij2.y * psA->rij2.y + psA->rij2.z * psA->rij2.z;
psA->rij3sq = psA->rij3.x * psA->rij3.x + psA->rij3.y * psA->rij3.y + psA->rij3.z * psA->rij3.z;
__global__ void kUpdatePart1CM_kernel() float ld1 = psA->d2 - psA->rij1sq;
{ float ld2 = psA->d2 - psA->rij2sq;
extern __shared__ float3 sCM[]; float ld3 = psA->d2 - psA->rij3sq;
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int rpos = cSim.pRandomPosition[blockIdx.x];
float3 CM = { 0.0f, 0.0f, 0.0f}; bool converged = false;
float4 CM1 = { 0.0f, 0.0f, 0.0f, 0.0f }; int iteration = 0;
while (iteration < 15 && !converged)
// Read CM outputs from previous step {
unsigned int cpos = threadIdx.x; converged = true;
#if 0 float3 rpij;
float4 CM2 = { 0.0f, 0.0f, 0.0f, 0.0f }; rpij.x = xpi.x - xpj1.x;
float4 CM3 = { 0.0f, 0.0f, 0.0f, 0.0f }; rpij.y = xpi.y - xpj1.y;
float4 CM4 = { 0.0f, 0.0f, 0.0f, 0.0f }; rpij.z = xpi.z - xpj1.z;
if (cpos < gridDim.x) float rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
CM1 = cSim.pLinearMomentum[cpos]; float rrpr = psA->rij1.x * rpij.x + psA->rij1.y * rpij.y + psA->rij1.z * rpij.z;
cpos += gridDim.x; float diff = fabs(ld1 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance);
if (cpos < gridDim.x) if (diff >= 1.0f)
CM2 = cSim.pLinearMomentum[cpos]; {
cpos += gridDim.x; float acor = (ld1 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij1sq);
if (cpos < gridDim.x) float3 dr;
CM3 = cSim.pLinearMomentum[cpos]; dr.x = psA->rij1.x * acor;
cpos += gridDim.x; dr.y = psA->rij1.y * acor;
if (cpos < gridDim.x) dr.z = psA->rij1.z * acor;
CM4 = cSim.pLinearMomentum[cpos]; xpi.x += dr.x * psA->InvMassI;
sCM[threadIdx.x].x = CM1.x + CM2.x + CM3.x + CM4.x; xpi.y += dr.y * psA->InvMassI;
sCM[threadIdx.x].y = CM1.y + CM2.y + CM3.y + CM4.y; xpi.z += dr.z * psA->InvMassI;
sCM[threadIdx.x].z = CM1.z + CM2.z + CM3.z + CM4.z; xpj1.x -= dr.x * invMassJ;
#else xpj1.y -= dr.y * invMassJ;
while (cpos < gridDim.x) xpj1.z -= dr.z * invMassJ;
{ converged = false;
CM1 = cSim.pLinearMomentum[cpos]; }
CM.x += CM1.x;
CM.y += CM1.y; if (atomID.z != -1)
CM.z += CM1.z; {
cpos += blockDim.x; rpij.x = xpi.x - xpj2.x;
} rpij.y = xpi.y - xpj2.y;
sCM[threadIdx.x].x = CM.x; rpij.z = xpi.z - xpj2.z;
sCM[threadIdx.x].y = CM.y; rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
sCM[threadIdx.x].z = CM.z; rrpr = psA->rij2.x * rpij.x + psA->rij2.y * rpij.y + psA->rij2.z * rpij.z;
#endif diff = fabs(ld2 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance);
__syncthreads(); if (diff >= 1.0f)
{
// Reduce CM float acor = (ld2 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij2sq);
unsigned int offset = 1; float3 dr;
unsigned int mask = 1; dr.x = psA->rij2.x * acor;
while (offset < blockDim.x) dr.y = psA->rij2.y * acor;
{ dr.z = psA->rij2.z * acor;
if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x)) xpi.x += dr.x * psA->InvMassI;
{ xpi.y += dr.y * psA->InvMassI;
sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x; xpi.z += dr.z * psA->InvMassI;
sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y; xpj2.x -= dr.x * invMassJ;
sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z; xpj2.y -= dr.y * invMassJ;
} xpj2.z -= dr.z * invMassJ;
mask = 2 * mask + 1; converged = false;
offset *= 2; }
__syncthreads(); }
}
if (atomID.w != -1)
while (pos < cSim.atoms) {
{ rpij.x = xpi.x - xpj3.x;
float4 velocity = cSim.pVelm4[pos]; rpij.y = xpi.y - xpj3.y;
float4 xVector = cSim.pxVector4[pos]; rpij.z = xpi.z - xpj3.z;
float4 random4a = cSim.pRandom4a[rpos + pos]; rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
float2 random2a = cSim.pRandom2a[rpos + pos]; rrpr = psA->rij3.x * rpij.x + psA->rij3.y * rpij.y + psA->rij3.z * rpij.z;
float4 apos = cSim.pPosq[pos]; diff = fabs(ld3 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance);
float4 force = cSim.pForce4[pos]; if (diff >= 1.0f)
{
float3 Vmh; float acor = (ld3 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij3sq);
float sqrtInvMass = sqrt(velocity.w); float3 dr;
Vmh.x = xVector.x * cSim.DOverTauC + sqrtInvMass * random4a.x; dr.x = psA->rij3.x * acor;
Vmh.y = xVector.y * cSim.DOverTauC + sqrtInvMass * random4a.y; dr.y = psA->rij3.y * acor;
Vmh.z = xVector.z * cSim.DOverTauC + sqrtInvMass * random4a.z; dr.z = psA->rij3.z * acor;
float4 vVector; xpi.x += dr.x * psA->InvMassI;
vVector.x = sqrtInvMass * random4a.w; xpi.y += dr.y * psA->InvMassI;
vVector.y = sqrtInvMass * random2a.x; xpi.z += dr.z * psA->InvMassI;
vVector.z = sqrtInvMass * random2a.y; xpj3.x -= dr.x * invMassJ;
vVector.w = 0.0f; xpj3.y -= dr.y * invMassJ;
cSim.pvVector4[pos] = vVector; xpj3.z -= dr.z * invMassJ;
velocity.x = velocity.x * cSim.EM + converged = false;
velocity.w * force.x * cSim.TauOneMinusEM + }
vVector.x - }
cSim.EM * Vmh.x - iteration++;
sCM[0].x; }
velocity.y = velocity.y * cSim.EM + cSim.pPosqP[atomID.x] = xpi;
velocity.w * force.y * cSim.TauOneMinusEM + cSim.pPosqP[atomID.y] = xpj1;
vVector.y - if (atomID.z != -1)
cSim.EM * Vmh.y - cSim.pPosqP[atomID.z] = xpj2;
sCM[0].y; if (atomID.w != -1)
velocity.z = velocity.z * cSim.EM + cSim.pPosqP[atomID.w] = xpj3;
velocity.w * force.z * cSim.TauOneMinusEM +
vVector.z - pos += blockDim.x * gridDim.x;
cSim.EM * Vmh.z - }
sCM[0].z; }
cSim.pOldPosq[pos] = apos;
#ifndef DeltaShake void kApplyFirstShake(gpuContext gpu)
apos.x += velocity.x * cSim.fix1; {
apos.y += velocity.y * cSim.fix1; // printf("kApplyFirstShake\n");
apos.z += velocity.z * cSim.fix1; if (gpu->sim.ShakeConstraints > 0)
#else {
apos.x = velocity.x * cSim.fix1; kApplyFirstShake_kernel<<<gpu->sim.blocks, gpu->sim.shake_threads_per_block>>>();
apos.y = velocity.y * cSim.fix1; LAUNCHERROR("kApplyFirstShake");
apos.z = velocity.z * cSim.fix1; }
#endif }
cSim.pPosqP[pos] = apos;
cSim.pVelm4[pos] = velocity; __global__ void kApplySecondShake_kernel()
pos += blockDim.x * gridDim.x; {
} __shared__ Atom sA[G8X_THREADS_PER_BLOCK];
} Atom* psA = &sA[threadIdx.x];
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
while (pos < cSim.ShakeConstraints)
{
void kUpdatePart1(gpuContext gpu) int4 atomID = cSim.pShakeID[pos];
{ float4 params = cSim.pShakeParameter[pos];
// printf("kUpdatePart1\n"); float4 apos = cSim.pOldPosq[atomID.x];
#if 0 float4 xpi = cSim.pPosq[atomID.x];
static int iteration = 0; float4 apos1 = cSim.pOldPosq[atomID.y];
if (iteration == 0) float4 xpj1 = cSim.pPosq[atomID.y];
{ float4 apos2 = {0.0f, 0.0f, 0.0f, 0.0f};
gpu->psPosq4->Download(); float4 xpj2 = {0.0f, 0.0f, 0.0f, 0.0f};
gpu->psVelm4->Download(); psA->InvMassI = params.x;
printf("# %d atoms\n", gpu->natoms); psA->M = params.y;
for (int i = 0; i < gpu->natoms; i++) psA->d2 = params.z;
{ float invMassJ = params.w;
printf("%5d %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f\n", i, if (atomID.z != -1)
gpu->psPosq4->_pSysStream[0][i].x, gpu->psPosq4->_pSysStream[0][i].y, {
gpu->psPosq4->_pSysStream[0][i].z, gpu->psPosq4->_pSysStream[0][i].w, apos2 = cSim.pOldPosq[atomID.z];
gpu->psVelm4->_pSysStream[0][i].x, gpu->psVelm4->_pSysStream[0][i].y, xpj2 = cSim.pPosq[atomID.z];
gpu->psVelm4->_pSysStream[0][i].z, gpu->psVelm4->_pSysStream[0][i].w }
); float4 apos3 = {0.0f, 0.0f, 0.0f, 0.0f};
} float4 xpj3 = {0.0f, 0.0f, 0.0f, 0.0f};
} if (atomID.w != -1)
iteration++; {
#endif apos3 = cSim.pOldPosq[atomID.w];
#if 0 xpj3 = cSim.pPosq[atomID.w];
static const float KILO = 1e3; // Thousand }
static const float BOLTZMANN = 1.380658e-23f; // (J/K)
static const float AVOGADRO = 6.0221367e23f; // () float3 xi, xj1, xj2, xj3;
static const float RGAS = BOLTZMANN * AVOGADRO; // (J/(mol K)) xi.x = apos.x;
static const float BOLTZ = (RGAS / KILO); // (kJ/(mol K)) xi.y = apos.y;
static int iteration = 0; xi.z = apos.z;
xj1.x = apos1.x;
// Check T xj1.y = apos1.y;
if (iteration % 1000 == 0) xj1.z = apos1.z;
{ xj2.x = apos2.x;
gpu->psVelm4->Download(); xj2.y = apos2.y;
float ke = 0.0f; xj2.z = apos2.z;
for (int i = 0; i < gpu->natoms; i++) xj3.x = apos3.x;
{ xj3.y = apos3.y;
float vx = gpu->psVelm4->_pSysStream[0][i].x; xj3.z = apos3.z;
float vy = gpu->psVelm4->_pSysStream[0][i].y; psA->rij1.x = xi.x - xj1.x;
float vz = gpu->psVelm4->_pSysStream[0][i].z; psA->rij1.y = xi.y - xj1.y;
float m = 1.0f / gpu->psVelm4->_pSysStream[0][i].w; psA->rij1.z = xi.z - xj1.z;
ke += m * (vx * vx + vy * vy + vz * vz); psA->rij2.x = xi.x - xj2.x;
} psA->rij2.y = xi.y - xj2.y;
float T = ke / (BOLTZ * gpu->sim.degreesOfFreedom); psA->rij2.z = xi.z - xj2.z;
printf("Iteration %d, Temperature is %f\n", iteration, T); psA->rij3.x = xi.x - xj3.x;
} psA->rij3.y = xi.y - xj3.y;
iteration++; psA->rij3.z = xi.z - xj3.z;
#endif psA->rij1sq = psA->rij1.x * psA->rij1.x + psA->rij1.y * psA->rij1.y + psA->rij1.z * psA->rij1.z;
if (gpu->bRemoveCM) psA->rij2sq = psA->rij2.x * psA->rij2.x + psA->rij2.y * psA->rij2.y + psA->rij2.z * psA->rij2.z;
{ psA->rij3sq = psA->rij3.x * psA->rij3.x + psA->rij3.y * psA->rij3.y + psA->rij3.z * psA->rij3.z;
kUpdatePart1CM_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block, gpu->sim.update_threads_per_block * sizeof(float3)>>>(); float ld1 = psA->d2 - psA->rij1sq;
LAUNCHERROR("kUpdatePart1CM"); float ld2 = psA->d2 - psA->rij2sq;
gpu->bRemoveCM = false; float ld3 = psA->d2 - psA->rij3sq;
#if 0
gpu->psLinearMomentum->Download(); bool converged = false;
gpu->psVelm4->Download(); int iteration = 0;
float3 mv = {0.0f, 0.0f, 0.0f}; while (iteration < 15 && !converged)
for (int i = 0; i < gpu->natoms; i++) {
{ converged = true;
float mass = 1.0f / gpu->psVelm4->_pSysStream[0][i].w; float3 rpij;
mv.x += mass * gpu->psVelm4->_pSysStream[0][i].x; rpij.x = xpi.x - xpj1.x;
mv.y += mass * gpu->psVelm4->_pSysStream[0][i].y; rpij.y = xpi.y - xpj1.y;
mv.z += mass * gpu->psVelm4->_pSysStream[0][i].z; rpij.z = xpi.z - xpj1.z;
} float rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
mv.x *= gpu->sim.inverseTotalMass; float rrpr = psA->rij1.x * rpij.x + psA->rij1.y * rpij.y + psA->rij1.z * rpij.z;
mv.y *= gpu->sim.inverseTotalMass; float diff = fabs(ld1 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance );
mv.z *= gpu->sim.inverseTotalMass; if (diff >= 1.0f)
{
float3 mv1 = {0.0f, 0.0f, 0.0f}; float acor = (ld1 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij1sq);
for (int i = 0; i < gpu->sim.blocks; i++) float3 dr;
{ dr.x = psA->rij1.x * acor;
mv1.x += gpu->psLinearMomentum->_pSysStream[0][i].x; dr.y = psA->rij1.y * acor;
mv1.y += gpu->psLinearMomentum->_pSysStream[0][i].y; dr.z = psA->rij1.z * acor;
mv1.z += gpu->psLinearMomentum->_pSysStream[0][i].z; xpi.x += dr.x * psA->InvMassI;
} xpi.y += dr.y * psA->InvMassI;
printf("%11.5f %11.5f %11.5f | %11.5f %11.5f %11.5f\n", mv.x, mv.y, mv.z, mv1.x, mv1.y, mv1.z); xpi.z += dr.z * psA->InvMassI;
#endif xpj1.x -= dr.x * invMassJ;
} xpj1.y -= dr.y * invMassJ;
else xpj1.z -= dr.z * invMassJ;
{ converged = false;
kUpdatePart1_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>(); }
LAUNCHERROR("kUpdatePart1");
} if (atomID.z != -1)
} {
rpij.x = xpi.x - xpj2.x;
__global__ void kApplyFirstShake_kernel() rpij.y = xpi.y - xpj2.y;
{ rpij.z = xpi.z - xpj2.z;
__shared__ Atom sA[G8X_THREADS_PER_BLOCK]; rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
Atom* psA = &sA[threadIdx.x]; rrpr = psA->rij2.x * rpij.x + psA->rij2.y * rpij.y + psA->rij2.z * rpij.z;
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x; diff = fabs(ld2 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance );
while (pos < cSim.ShakeConstraints) if (diff >= 1.0f)
{ {
int4 atomID = cSim.pShakeID[pos]; float acor = (ld2 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij2sq);
float4 params = cSim.pShakeParameter[pos]; float3 dr;
float4 apos = cSim.pOldPosq[atomID.x]; dr.x = psA->rij2.x * acor;
float4 xpi = cSim.pPosqP[atomID.x]; dr.y = psA->rij2.y * acor;
float4 apos1 = cSim.pOldPosq[atomID.y]; dr.z = psA->rij2.z * acor;
float4 xpj1 = cSim.pPosqP[atomID.y]; xpi.x += dr.x * psA->InvMassI;
float4 apos2 = {0.0f, 0.0f, 0.0f, 0.0f}; xpi.y += dr.y * psA->InvMassI;
float4 xpj2 = {0.0f, 0.0f, 0.0f, 0.0f}; xpi.z += dr.z * psA->InvMassI;
psA->InvMassI = params.x; xpj2.x -= dr.x * invMassJ;
psA->M = params.y; xpj2.y -= dr.y * invMassJ;
psA->d2 = params.z; xpj2.z -= dr.z * invMassJ;
float invMassJ = params.w; converged = false;
if (atomID.z != -1) }
{ }
apos2 = cSim.pOldPosq[atomID.z];
xpj2 = cSim.pPosqP[atomID.z]; if (atomID.w != -1)
} {
float4 apos3 = {0.0f, 0.0f, 0.0f, 0.0f}; rpij.x = xpi.x - xpj3.x;
float4 xpj3 = {0.0f, 0.0f, 0.0f, 0.0f}; rpij.y = xpi.y - xpj3.y;
if (atomID.w != -1) rpij.z = xpi.z - xpj3.z;
{ rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
apos3 = cSim.pOldPosq[atomID.w]; rrpr = psA->rij3.x * rpij.x + psA->rij3.y * rpij.y + psA->rij3.z * rpij.z;
xpj3 = cSim.pPosqP[atomID.w]; diff = fabs(ld3 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance );
} if (diff >= 1.0f)
{
float3 xi, xj1, xj2, xj3; float acor = (ld3 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij3sq);
xi.x = apos.x; float3 dr;
xi.y = apos.y; dr.x = psA->rij3.x * acor;
xi.z = apos.z; dr.y = psA->rij3.y * acor;
xj1.x = apos1.x; dr.z = psA->rij3.z * acor;
xj1.y = apos1.y; xpi.x += dr.x * psA->InvMassI;
xj1.z = apos1.z; xpi.y += dr.y * psA->InvMassI;
xj2.x = apos2.x; xpi.z += dr.z * psA->InvMassI;
xj2.y = apos2.y; xpj3.x -= dr.x * invMassJ;
xj2.z = apos2.z; xpj3.y -= dr.y * invMassJ;
xj3.x = apos3.x; xpj3.z -= dr.z * invMassJ;
xj3.y = apos3.y; converged = false;
xj3.z = apos3.z; }
#ifndef DeltaShake }
xpi.x -= xi.x; iteration++;
xpi.y -= xi.y; }
xpi.z -= xi.z;
xpj1.x -= xj1.x; xpi.x += xi.x;
xpj1.y -= xj1.y; xpi.y += xi.y;
xpj1.z -= xj1.z; xpi.z += xi.z;
xpj2.x -= xj2.x; xpj1.x += xj1.x;
xpj2.y -= xj2.y; xpj1.y += xj1.y;
xpj2.z -= xj2.z; xpj1.z += xj1.z;
xpj3.x -= xj3.x; xpj2.x += xj2.x;
xpj3.y -= xj3.y; xpj2.y += xj2.y;
xpj3.z -= xj3.z; xpj2.z += xj2.z;
#endif xpj3.x += xj3.x;
psA->rij1.x = xi.x - xj1.x; xpj3.y += xj3.y;
psA->rij1.y = xi.y - xj1.y; xpj3.z += xj3.z;
psA->rij1.z = xi.z - xj1.z;
psA->rij2.x = xi.x - xj2.x; cSim.pPosq[atomID.x] = xpi;
psA->rij2.y = xi.y - xj2.y; cSim.pPosq[atomID.y] = xpj1;
psA->rij2.z = xi.z - xj2.z;
psA->rij3.x = xi.x - xj3.x; if (atomID.z != -1)
psA->rij3.y = xi.y - xj3.y; cSim.pPosq[atomID.z] = xpj2;
psA->rij3.z = xi.z - xj3.z;
psA->rij1sq = psA->rij1.x * psA->rij1.x + psA->rij1.y * psA->rij1.y + psA->rij1.z * psA->rij1.z; if (atomID.w != -1)
psA->rij2sq = psA->rij2.x * psA->rij2.x + psA->rij2.y * psA->rij2.y + psA->rij2.z * psA->rij2.z; cSim.pPosq[atomID.w] = xpj3;
psA->rij3sq = psA->rij3.x * psA->rij3.x + psA->rij3.y * psA->rij3.y + psA->rij3.z * psA->rij3.z;
float ld1 = psA->d2 - psA->rij1sq; pos += blockDim.x * gridDim.x;
float ld2 = psA->d2 - psA->rij2sq; }
float ld3 = psA->d2 - psA->rij3sq; }
__global__ void kApplyNoShake_kernel()
bool converged = false; {
int iteration = 0; unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
while (iteration < 15 && !converged) while (pos < cSim.NonShakeConstraints)
{ {
converged = true; int atomID = cSim.pNonShakeID[pos];
float3 rpij; float4 apos = cSim.pOldPosq[atomID];
rpij.x = xpi.x - xpj1.x; float4 xpi = cSim.pPosq[atomID];
rpij.y = xpi.y - xpj1.y; xpi.x += apos.x;
rpij.z = xpi.z - xpj1.z; xpi.y += apos.y;
float rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z; xpi.z += apos.z;
float rrpr = psA->rij1.x * rpij.x + psA->rij1.y * rpij.y + psA->rij1.z * rpij.z; cSim.pPosq[atomID] = xpi;
float diff = fabs(ld1 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance);
if (diff >= 1.0f) pos += blockDim.x * gridDim.x;
{ }
float acor = (ld1 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij1sq); }
float3 dr;
dr.x = psA->rij1.x * acor;
dr.y = psA->rij1.y * acor; void kApplySecondShake(gpuContext gpu)
dr.z = psA->rij1.z * acor; {
xpi.x += dr.x * psA->InvMassI; // printf("kApplySecondShake\n");
xpi.y += dr.y * psA->InvMassI; if (gpu->sim.ShakeConstraints > 0)
xpi.z += dr.z * psA->InvMassI; {
xpj1.x -= dr.x * invMassJ; kApplySecondShake_kernel<<<gpu->sim.blocks, gpu->sim.shake_threads_per_block>>>();
xpj1.y -= dr.y * invMassJ; LAUNCHERROR("kApplySecondShake");
xpj1.z -= dr.z * invMassJ; }
converged = false;
} // handle non-Shake atoms
if (atomID.z != -1) if (gpu->sim.NonShakeConstraints > 0)
{ {
rpij.x = xpi.x - xpj2.x; kApplyNoShake_kernel<<<gpu->sim.blocks, gpu->sim.nonshake_threads_per_block>>>();
rpij.y = xpi.y - xpj2.y; LAUNCHERROR("kApplyNoShake");
rpij.z = xpi.z - xpj2.z; }
rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z; }
rrpr = psA->rij2.x * rpij.x + psA->rij2.y * rpij.y + psA->rij2.z * rpij.z;
diff = fabs(ld2 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance);
if (diff >= 1.0f)
{
float acor = (ld2 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij2sq);
float3 dr;
dr.x = psA->rij2.x * acor;
dr.y = psA->rij2.y * acor;
dr.z = psA->rij2.z * acor;
xpi.x += dr.x * psA->InvMassI;
xpi.y += dr.y * psA->InvMassI;
xpi.z += dr.z * psA->InvMassI;
xpj2.x -= dr.x * invMassJ;
xpj2.y -= dr.y * invMassJ;
xpj2.z -= dr.z * invMassJ;
converged = false;
}
}
if (atomID.w != -1)
{
rpij.x = xpi.x - xpj3.x;
rpij.y = xpi.y - xpj3.y;
rpij.z = xpi.z - xpj3.z;
rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
rrpr = psA->rij3.x * rpij.x + psA->rij3.y * rpij.y + psA->rij3.z * rpij.z;
diff = fabs(ld3 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance);
if (diff >= 1.0f)
{
float acor = (ld3 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij3sq);
float3 dr;
dr.x = psA->rij3.x * acor;
dr.y = psA->rij3.y * acor;
dr.z = psA->rij3.z * acor;
xpi.x += dr.x * psA->InvMassI;
xpi.y += dr.y * psA->InvMassI;
xpi.z += dr.z * psA->InvMassI;
xpj3.x -= dr.x * invMassJ;
xpj3.y -= dr.y * invMassJ;
xpj3.z -= dr.z * invMassJ;
converged = false;
}
}
iteration++;
}
#ifndef DeltaShake
xpi.x += xi.x;
xpi.y += xi.y;
xpi.z += xi.z;
xpj1.x += xj1.x;
xpj1.y += xj1.y;
xpj1.z += xj1.z;
xpj2.x += xj2.x;
xpj2.y += xj2.y;
xpj2.z += xj2.z;
xpj3.x += xj3.x;
xpj3.y += xj3.y;
xpj3.z += xj3.z;
#endif
cSim.pPosqP[atomID.x] = xpi;
cSim.pPosqP[atomID.y] = xpj1;
if (atomID.z != -1)
cSim.pPosqP[atomID.z] = xpj2;
if (atomID.w != -1)
cSim.pPosqP[atomID.w] = xpj3;
pos += blockDim.x * gridDim.x;
}
}
void kApplyFirstShake(gpuContext gpu)
{
// printf("kApplyFirstShake\n");
if (gpu->sim.ShakeConstraints > 0)
{
kApplyFirstShake_kernel<<<gpu->sim.blocks, gpu->sim.shake_threads_per_block>>>();
LAUNCHERROR("kApplyFirstShake");
}
}
__global__ void kUpdatePart2_kernel()
{
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int rpos = cSim.pRandomPosition[blockIdx.x];
__syncthreads();
while (pos < cSim.atoms)
{
float4 velocity = cSim.pVelm4[pos];
#ifndef DeltaShake
float4 apos = cSim.pPosq[pos];
#endif
float4 xPrime = cSim.pPosqP[pos];
float4 vVector = cSim.pvVector4[pos];
float4 xVector;
float4 random4b = cSim.pRandom4b[rpos + pos];
float2 random2b = cSim.pRandom2b[rpos + pos];
float3 Xmh;
float sqrtInvMass = sqrt(velocity.w);
#ifdef DeltaShake
velocity.x = xPrime.x * cSim.oneOverFix1;
velocity.y = xPrime.y * cSim.oneOverFix1;
velocity.z = xPrime.z * cSim.oneOverFix1;
#else
velocity.x = (xPrime.x - apos.x) * cSim.oneOverFix1;
velocity.y = (xPrime.y - apos.y) * cSim.oneOverFix1;
velocity.z = (xPrime.z - apos.z) * cSim.oneOverFix1;
#endif
Xmh.x = vVector.x * cSim.TauDOverEMMinusOne +
sqrtInvMass * random4b.x;
Xmh.y = vVector.y * cSim.TauDOverEMMinusOne +
sqrtInvMass * random4b.y;
Xmh.z = vVector.z * cSim.TauDOverEMMinusOne +
sqrtInvMass * random4b.z;
xVector.x = sqrtInvMass * random4b.w;
xVector.y = sqrtInvMass * random2b.x;
xVector.z = sqrtInvMass * random2b.y;
xPrime.x += xVector.x - Xmh.x;
xPrime.y += xVector.y - Xmh.y;
xPrime.z += xVector.z - Xmh.z;
cSim.pPosq[pos] = xPrime;
cSim.pVelm4[pos] = velocity;
cSim.pxVector4[pos] = xVector;
pos += blockDim.x * gridDim.x;
}
// Update random position pointer
if (threadIdx.x == 0)
{
rpos += cSim.paddedNumberOfAtoms;
if (rpos > cSim.randoms)
rpos -= cSim.randoms;
cSim.pRandomPosition[blockIdx.x] = rpos;
}
}
__global__ void kUpdatePart2CM_kernel()
{
extern __shared__ float3 sCM[];
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int rpos = cSim.pRandomPosition[blockIdx.x];
float3 CM = {0.0f, 0.0f, 0.0f};
__syncthreads();
while (pos < cSim.atoms)
{
float4 velocity = cSim.pVelm4[pos];
#ifndef DeltaShake
float4 apos = cSim.pPosq[pos];
#endif
float4 xPrime = cSim.pPosqP[pos];
float4 vVector = cSim.pvVector4[pos];
float4 xVector;
float4 random4b = cSim.pRandom4b[rpos + pos];
float2 random2b = cSim.pRandom2b[rpos + pos];
float3 Xmh;
float mass = 1.0f / velocity.w;
float sqrtInvMass = sqrt(velocity.w);
#ifdef DeltaShake
velocity.x = xPrime.x * cSim.oneOverFix1;
velocity.y = xPrime.y * cSim.oneOverFix1;
velocity.z = xPrime.z * cSim.oneOverFix1;
#else
velocity.x = (xPrime.x - apos.x) * cSim.oneOverFix1;
velocity.y = (xPrime.y - apos.y) * cSim.oneOverFix1;
velocity.z = (xPrime.z - apos.z) * cSim.oneOverFix1;
#endif
CM.x += mass * velocity.x;
CM.y += mass * velocity.y;
CM.z += mass * velocity.z;
Xmh.x = vVector.x * cSim.TauDOverEMMinusOne +
sqrtInvMass * random4b.x;
Xmh.y = vVector.y * cSim.TauDOverEMMinusOne +
sqrtInvMass * random4b.y;
Xmh.z = vVector.z * cSim.TauDOverEMMinusOne +
sqrtInvMass * random4b.z;
xVector.x = sqrtInvMass * random4b.w;
xVector.y = sqrtInvMass * random2b.x;
xVector.z = sqrtInvMass * random2b.y;
xPrime.x += xVector.x - Xmh.x;
xPrime.y += xVector.y - Xmh.y;
xPrime.z += xVector.z - Xmh.z;
cSim.pPosq[pos] = xPrime;
cSim.pVelm4[pos] = velocity;
cSim.pxVector4[pos] = xVector;
pos += blockDim.x * gridDim.x;
}
// Update random position pointer
if (threadIdx.x == 0)
{
rpos += cSim.paddedNumberOfAtoms;
if (rpos > cSim.randoms)
rpos -= cSim.randoms;
cSim.pRandomPosition[blockIdx.x] = rpos;
}
// Scale CM
CM.x *= cSim.inverseTotalMass;
CM.y *= cSim.inverseTotalMass;
CM.z *= cSim.inverseTotalMass;
sCM[threadIdx.x] = CM;
__syncthreads();
// Reduce CM for CTA
unsigned int offset = 1;
unsigned int mask = 1;
while (offset < blockDim.x)
{
if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
{
sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
}
mask = 2 * mask + 1;
offset *= 2;
__syncthreads();
}
if (threadIdx.x == 0)
{
float4 CM;
CM.x = sCM[0].x;
CM.y = sCM[0].y;
CM.z = sCM[0].z;
CM.w = 0.0f;
cSim.pLinearMomentum[blockIdx.x] = CM;
}
}
extern void kGenerateRandoms(gpuContext gpu);
void kUpdatePart2(gpuContext gpu)
{
// printf("kUpdatePart2\n");
if (gpu->bCalculateCM)
{
kUpdatePart2CM_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block, gpu->sim.update_threads_per_block * sizeof(float3)>>>();
LAUNCHERROR("kUpdatePart2CM");
gpu->bCalculateCM = false;
gpu->bRemoveCM = true;
#if 0
gpu->psLinearMomentum->Download();
gpu->psVelm4->Download();
float3 mv = {0.0f, 0.0f, 0.0f};
for (int i = 0; i < gpu->natoms; i++)
{
float mass = 1.0f / gpu->psVelm4->_pSysStream[0][i].w;
mv.x += mass * gpu->psVelm4->_pSysStream[0][i].x;
mv.y += mass * gpu->psVelm4->_pSysStream[0][i].y;
mv.z += mass * gpu->psVelm4->_pSysStream[0][i].z;
}
mv.x *= gpu->sim.inverseTotalMass;
mv.y *= gpu->sim.inverseTotalMass;
mv.z *= gpu->sim.inverseTotalMass;
float3 mv1 = {0.0f, 0.0f, 0.0f};
for (int i = 0; i < gpu->sim.blocks; i++)
{
mv1.x += gpu->psLinearMomentum->_pSysStream[0][i].x;
mv1.y += gpu->psLinearMomentum->_pSysStream[0][i].y;
mv1.z += gpu->psLinearMomentum->_pSysStream[0][i].z;
}
printf("%11.5f %11.5f %11.5f | %11.5f %11.5f %11.5f\n", mv.x, mv.y, mv.z, mv1.x, mv1.y, mv1.z);
#endif
}
else
{
kUpdatePart2_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>();
LAUNCHERROR("kUpdatePart2");
}
// Update randoms if necessary
gpu->iterations++;
if (gpu->iterations == gpu->sim.randomIterations)
{
kGenerateRandoms(gpu);
gpu->iterations = 0;
}
}
__global__ void kApplySecondShake_kernel()
{
__shared__ Atom sA[G8X_THREADS_PER_BLOCK];
Atom* psA = &sA[threadIdx.x];
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
while (pos < cSim.ShakeConstraints)
{
int4 atomID = cSim.pShakeID[pos];
float4 params = cSim.pShakeParameter[pos];
float4 apos = cSim.pOldPosq[atomID.x];
float4 xpi = cSim.pPosq[atomID.x];
float4 apos1 = cSim.pOldPosq[atomID.y];
float4 xpj1 = cSim.pPosq[atomID.y];
float4 apos2 = {0.0f, 0.0f, 0.0f, 0.0f};
float4 xpj2 = {0.0f, 0.0f, 0.0f, 0.0f};
psA->InvMassI = params.x;
psA->M = params.y;
psA->d2 = params.z;
float invMassJ = params.w;
if (atomID.z != -1)
{
apos2 = cSim.pOldPosq[atomID.z];
xpj2 = cSim.pPosq[atomID.z];
}
float4 apos3 = {0.0f, 0.0f, 0.0f, 0.0f};
float4 xpj3 = {0.0f, 0.0f, 0.0f, 0.0f};
if (atomID.w != -1)
{
apos3 = cSim.pOldPosq[atomID.w];
xpj3 = cSim.pPosq[atomID.w];
}
float3 xi, xj1, xj2, xj3;
xi.x = apos.x;
xi.y = apos.y;
xi.z = apos.z;
xj1.x = apos1.x;
xj1.y = apos1.y;
xj1.z = apos1.z;
xj2.x = apos2.x;
xj2.y = apos2.y;
xj2.z = apos2.z;
xj3.x = apos3.x;
xj3.y = apos3.y;
xj3.z = apos3.z;
#ifndef DeltaShake
xpi.x -= xi.x;
xpi.y -= xi.y;
xpi.z -= xi.z;
xpj1.x -= xj1.x;
xpj1.y -= xj1.y;
xpj1.z -= xj1.z;
xpj2.x -= xj2.x;
xpj2.y -= xj2.y;
xpj2.z -= xj2.z;
xpj3.x -= xj3.x;
xpj3.y -= xj3.y;
xpj3.z -= xj3.z;
#endif
psA->rij1.x = xi.x - xj1.x;
psA->rij1.y = xi.y - xj1.y;
psA->rij1.z = xi.z - xj1.z;
psA->rij2.x = xi.x - xj2.x;
psA->rij2.y = xi.y - xj2.y;
psA->rij2.z = xi.z - xj2.z;
psA->rij3.x = xi.x - xj3.x;
psA->rij3.y = xi.y - xj3.y;
psA->rij3.z = xi.z - xj3.z;
psA->rij1sq = psA->rij1.x * psA->rij1.x + psA->rij1.y * psA->rij1.y + psA->rij1.z * psA->rij1.z;
psA->rij2sq = psA->rij2.x * psA->rij2.x + psA->rij2.y * psA->rij2.y + psA->rij2.z * psA->rij2.z;
psA->rij3sq = psA->rij3.x * psA->rij3.x + psA->rij3.y * psA->rij3.y + psA->rij3.z * psA->rij3.z;
float ld1 = psA->d2 - psA->rij1sq;
float ld2 = psA->d2 - psA->rij2sq;
float ld3 = psA->d2 - psA->rij3sq;
bool converged = false;
int iteration = 0;
while (iteration < 15 && !converged)
{
converged = true;
float3 rpij;
rpij.x = xpi.x - xpj1.x;
rpij.y = xpi.y - xpj1.y;
rpij.z = xpi.z - xpj1.z;
float rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
float rrpr = psA->rij1.x * rpij.x + psA->rij1.y * rpij.y + psA->rij1.z * rpij.z;
float diff = fabs(ld1 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance );
if (diff >= 1.0f)
{
float acor = (ld1 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij1sq);
float3 dr;
dr.x = psA->rij1.x * acor;
dr.y = psA->rij1.y * acor;
dr.z = psA->rij1.z * acor;
xpi.x += dr.x * psA->InvMassI;
xpi.y += dr.y * psA->InvMassI;
xpi.z += dr.z * psA->InvMassI;
xpj1.x -= dr.x * invMassJ;
xpj1.y -= dr.y * invMassJ;
xpj1.z -= dr.z * invMassJ;
converged = false;
}
if (atomID.z != -1)
{
rpij.x = xpi.x - xpj2.x;
rpij.y = xpi.y - xpj2.y;
rpij.z = xpi.z - xpj2.z;
rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
rrpr = psA->rij2.x * rpij.x + psA->rij2.y * rpij.y + psA->rij2.z * rpij.z;
diff = fabs(ld2 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance );
if (diff >= 1.0f)
{
float acor = (ld2 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij2sq);
float3 dr;
dr.x = psA->rij2.x * acor;
dr.y = psA->rij2.y * acor;
dr.z = psA->rij2.z * acor;
xpi.x += dr.x * psA->InvMassI;
xpi.y += dr.y * psA->InvMassI;
xpi.z += dr.z * psA->InvMassI;
xpj2.x -= dr.x * invMassJ;
xpj2.y -= dr.y * invMassJ;
xpj2.z -= dr.z * invMassJ;
converged = false;
}
}
if (atomID.w != -1)
{
rpij.x = xpi.x - xpj3.x;
rpij.y = xpi.y - xpj3.y;
rpij.z = xpi.z - xpj3.z;
rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
rrpr = psA->rij3.x * rpij.x + psA->rij3.y * rpij.y + psA->rij3.z * rpij.z;
diff = fabs(ld3 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance );
if (diff >= 1.0f)
{
float acor = (ld3 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij3sq);
float3 dr;
dr.x = psA->rij3.x * acor;
dr.y = psA->rij3.y * acor;
dr.z = psA->rij3.z * acor;
xpi.x += dr.x * psA->InvMassI;
xpi.y += dr.y * psA->InvMassI;
xpi.z += dr.z * psA->InvMassI;
xpj3.x -= dr.x * invMassJ;
xpj3.y -= dr.y * invMassJ;
xpj3.z -= dr.z * invMassJ;
converged = false;
}
}
iteration++;
}
xpi.x += xi.x;
xpi.y += xi.y;
xpi.z += xi.z;
xpj1.x += xj1.x;
xpj1.y += xj1.y;
xpj1.z += xj1.z;
xpj2.x += xj2.x;
xpj2.y += xj2.y;
xpj2.z += xj2.z;
xpj3.x += xj3.x;
xpj3.y += xj3.y;
xpj3.z += xj3.z;
cSim.pPosq[atomID.x] = xpi;
cSim.pPosq[atomID.y] = xpj1;
if (atomID.z != -1)
cSim.pPosq[atomID.z] = xpj2;
if (atomID.w != -1)
cSim.pPosq[atomID.w] = xpj3;
pos += blockDim.x * gridDim.x;
}
}
__global__ void kApplyNoShake_kernel()
{
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
while (pos < cSim.NonShakeConstraints)
{
int atomID = cSim.pNonShakeID[pos];
float4 apos = cSim.pOldPosq[atomID];
float4 xpi = cSim.pPosq[atomID];
xpi.x += apos.x;
xpi.y += apos.y;
xpi.z += apos.z;
cSim.pPosq[atomID] = xpi;
pos += blockDim.x * gridDim.x;
}
}
void kCPUShake2(gpuContext gpu)
{
}
void kApplySecondShake(gpuContext gpu)
{
// printf("kApplySecondShake\n");
// kCPUShake2(gpu);
if (gpu->sim.ShakeConstraints > 0)
{
kApplySecondShake_kernel<<<gpu->sim.blocks, gpu->sim.shake_threads_per_block>>>();
LAUNCHERROR("kApplySecondShake");
}
// handle non-Shake atoms
#ifdef DeltaShake
if (gpu->sim.NonShakeConstraints > 0)
{
//fprintf( gpu->log, "kApplyNoShake_kernel %d %d \n", gpu->sim.blocks, gpu->sim.nonshake_threads_per_block); fflush( gpu->log );
kApplyNoShake_kernel<<<gpu->sim.blocks, gpu->sim.nonshake_threads_per_block>>>();
LAUNCHERROR("kApplyNoShake");
}
#endif
}
...@@ -35,8 +35,6 @@ using namespace std; ...@@ -35,8 +35,6 @@ using namespace std;
#include "gputypes.h" #include "gputypes.h"
#define DeltaShake
static __constant__ cudaGmxSimulation cSim; static __constant__ cudaGmxSimulation cSim;
void SetVerletUpdateSim(gpuContext gpu) void SetVerletUpdateSim(gpuContext gpu)
...@@ -53,102 +51,11 @@ void GetVerletUpdateSim(gpuContext gpu) ...@@ -53,102 +51,11 @@ void GetVerletUpdateSim(gpuContext gpu)
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed"); RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
} }
__global__ void kVerletUpdatePart1_kernel() // Include versions of the kernels with and with center of mass motion removal.
{
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
while (pos < cSim.atoms)
{
float4 apos = cSim.pPosq[pos];
float4 velocity = cSim.pVelm4[pos];
float4 force = cSim.pForce4[pos];
float dtOverMass = cSim.deltaT*velocity.w;
cSim.pOldPosq[pos] = apos;
velocity.x += dtOverMass*force.x;
velocity.y += dtOverMass*force.y;
velocity.z += dtOverMass*force.z;
#ifndef DeltaShake
apos.x += velocity.x*cSim.deltaT;
apos.y += velocity.y*cSim.deltaT;
apos.z += velocity.z*cSim.deltaT;
#else
apos.x = velocity.x*cSim.deltaT;
apos.y = velocity.y*cSim.deltaT;
apos.z = velocity.z*cSim.deltaT;
#endif
cSim.pPosqP[pos] = apos;
cSim.pVelm4[pos] = velocity;
pos += blockDim.x * gridDim.x;
}
}
__global__ void kVerletUpdatePart1CM_kernel()
{
extern __shared__ float3 sCM[];
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
float3 CM = { 0.0f, 0.0f, 0.0f};
float4 CM1 = { 0.0f, 0.0f, 0.0f, 0.0f };
// Read CM outputs from previous step
unsigned int cpos = threadIdx.x;
while (cpos < gridDim.x)
{
CM1 = cSim.pLinearMomentum[cpos];
CM.x += CM1.x;
CM.y += CM1.y;
CM.z += CM1.z;
cpos += blockDim.x;
}
sCM[threadIdx.x].x = CM.x;
sCM[threadIdx.x].y = CM.y;
sCM[threadIdx.x].z = CM.z;
__syncthreads();
// Reduce CM
unsigned int offset = 1;
unsigned int mask = 1;
while (offset < blockDim.x)
{
if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
{
sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
}
mask = 2 * mask + 1;
offset *= 2;
__syncthreads();
}
while (pos < cSim.atoms)
{
float4 apos = cSim.pPosq[pos];
float4 velocity = cSim.pVelm4[pos];
float4 force = cSim.pForce4[pos];
float dtOverMass = cSim.deltaT*velocity.w;
cSim.pOldPosq[pos] = apos;
velocity.x += dtOverMass*force.x-sCM[0].x;
velocity.y += dtOverMass*force.y-sCM[0].y;
velocity.z += dtOverMass*force.z-sCM[0].z;
#ifndef DeltaShake #include "kVerletUpdate.h"
apos.x += velocity.x*cSim.deltaT; #define REMOVE_CM
apos.y += velocity.y*cSim.deltaT; #include "kVerletUpdate.h"
apos.z += velocity.z*cSim.deltaT;
#else
apos.x = velocity.x*cSim.deltaT;
apos.y = velocity.y*cSim.deltaT;
apos.z = velocity.z*cSim.deltaT;
#endif
cSim.pPosqP[pos] = apos;
cSim.pVelm4[pos] = velocity;
pos += blockDim.x * gridDim.x;
}
}
void kVerletUpdatePart1(gpuContext gpu) void kVerletUpdatePart1(gpuContext gpu)
{ {
...@@ -166,105 +73,6 @@ void kVerletUpdatePart1(gpuContext gpu) ...@@ -166,105 +73,6 @@ void kVerletUpdatePart1(gpuContext gpu)
} }
} }
__global__ void kVerletUpdatePart2_kernel()
{
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
while (pos < cSim.atoms)
{
float4 velocity = cSim.pVelm4[pos];
float4 apos = cSim.pPosq[pos];
float4 xPrime = cSim.pPosqP[pos];
#ifndef DeltaShake
velocity.x = cSim.oneOverDeltaT*(xPrime.x-apos.x);
velocity.y = cSim.oneOverDeltaT*(xPrime.y-apos.y);
velocity.z = cSim.oneOverDeltaT*(xPrime.z-apos.z);
#else
velocity.x = cSim.oneOverDeltaT*(xPrime.x);
velocity.y = cSim.oneOverDeltaT*(xPrime.y);
velocity.z = cSim.oneOverDeltaT*(xPrime.z);
xPrime.x += apos.x;
xPrime.y += apos.y;
xPrime.z += apos.z;
#endif
cSim.pPosq[pos] = xPrime;
cSim.pVelm4[pos] = velocity;
pos += blockDim.x * gridDim.x;
}
}
__global__ void kVerletUpdatePart2CM_kernel()
{
extern __shared__ float3 sCM[];
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
float3 CM = {0.0f, 0.0f, 0.0f};
while (pos < cSim.atoms)
{
float4 velocity = cSim.pVelm4[pos];
float4 apos = cSim.pPosq[pos];
float4 xPrime = cSim.pPosqP[pos];
float mass = 1.0f / velocity.w;
#ifndef DeltaShake
velocity.x = cSim.oneOverDeltaT*(xPrime.x-apos.x);
velocity.y = cSim.oneOverDeltaT*(xPrime.y-apos.y);
velocity.z = cSim.oneOverDeltaT*(xPrime.z-apos.z);
#else
velocity.x = cSim.oneOverDeltaT*(xPrime.x);
velocity.y = cSim.oneOverDeltaT*(xPrime.y);
velocity.z = cSim.oneOverDeltaT*(xPrime.z);
xPrime.x += apos.x;
xPrime.y += apos.y;
xPrime.z += apos.z;
#endif
CM.x += mass * velocity.x;
CM.y += mass * velocity.y;
CM.z += mass * velocity.z;
cSim.pPosq[pos] = xPrime;
cSim.pVelm4[pos] = velocity;
pos += blockDim.x * gridDim.x;
}
// Scale CM
CM.x *= cSim.inverseTotalMass;
CM.y *= cSim.inverseTotalMass;
CM.z *= cSim.inverseTotalMass;
sCM[threadIdx.x] = CM;
__syncthreads();
// Reduce CM for CTA
unsigned int offset = 1;
unsigned int mask = 1;
while (offset < blockDim.x)
{
if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
{
sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
}
mask = 2 * mask + 1;
offset *= 2;
__syncthreads();
}
if (threadIdx.x == 0)
{
float4 CM;
CM.x = sCM[0].x;
CM.y = sCM[0].y;
CM.z = sCM[0].z;
CM.w = 0.0f;
cSim.pLinearMomentum[blockIdx.x] = CM;
}
}
void kVerletUpdatePart2(gpuContext gpu) void kVerletUpdatePart2(gpuContext gpu)
{ {
// printf("kVerletUpdatePart2\n"); // printf("kVerletUpdatePart2\n");
......
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
/**
* This file contains the kernels for Verlet integration. It is included
* several times in kVerletUpdate.cu with different #defines to generate
* different versions of the kernels.
*/
#ifdef REMOVE_CM
__global__ void kVerletUpdatePart1CM_kernel()
#else
__global__ void kVerletUpdatePart1_kernel()
#endif
{
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
#ifdef REMOVE_CM
extern __shared__ float3 sCM[];
float3 CM = { 0.0f, 0.0f, 0.0f};
float4 CM1 = { 0.0f, 0.0f, 0.0f, 0.0f };
// Read CM outputs from previous step
unsigned int cpos = threadIdx.x;
while (cpos < gridDim.x)
{
CM1 = cSim.pLinearMomentum[cpos];
CM.x += CM1.x;
CM.y += CM1.y;
CM.z += CM1.z;
cpos += blockDim.x;
}
sCM[threadIdx.x].x = CM.x;
sCM[threadIdx.x].y = CM.y;
sCM[threadIdx.x].z = CM.z;
__syncthreads();
// Reduce CM
unsigned int offset = 1;
unsigned int mask = 1;
while (offset < blockDim.x)
{
if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
{
sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
}
mask = 2 * mask + 1;
offset *= 2;
__syncthreads();
}
#endif
while (pos < cSim.atoms)
{
float4 apos = cSim.pPosq[pos];
float4 velocity = cSim.pVelm4[pos];
float4 force = cSim.pForce4[pos];
float dtOverMass = cSim.deltaT*velocity.w;
cSim.pOldPosq[pos] = apos;
velocity.x += dtOverMass*force.x;
velocity.y += dtOverMass*force.y;
velocity.z += dtOverMass*force.z;
#ifdef REMOVE_CM
velocity.x -= sCM[0].x;
velocity.y -= sCM[0].y;
velocity.z -= sCM[0].z;
#endif
apos.x = velocity.x*cSim.deltaT;
apos.y = velocity.y*cSim.deltaT;
apos.z = velocity.z*cSim.deltaT;
cSim.pPosqP[pos] = apos;
cSim.pVelm4[pos] = velocity;
pos += blockDim.x * gridDim.x;
}
}
#ifdef REMOVE_CM
__global__ void kVerletUpdatePart2CM_kernel()
#else
__global__ void kVerletUpdatePart2_kernel()
#endif
{
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
#ifdef REMOVE_CM
extern __shared__ float3 sCM[];
float3 CM = {0.0f, 0.0f, 0.0f};
#endif
while (pos < cSim.atoms)
{
float4 velocity = cSim.pVelm4[pos];
float4 apos = cSim.pPosq[pos];
float4 xPrime = cSim.pPosqP[pos];
velocity.x = cSim.oneOverDeltaT*(xPrime.x);
velocity.y = cSim.oneOverDeltaT*(xPrime.y);
velocity.z = cSim.oneOverDeltaT*(xPrime.z);
xPrime.x += apos.x;
xPrime.y += apos.y;
xPrime.z += apos.z;
#ifdef REMOVE_CM
float mass = 1.0f / velocity.w;
CM.x += mass * velocity.x;
CM.y += mass * velocity.y;
CM.z += mass * velocity.z;
#endif
cSim.pPosq[pos] = xPrime;
cSim.pVelm4[pos] = velocity;
pos += blockDim.x * gridDim.x;
}
#ifdef REMOVE_CM
// Scale CM
CM.x *= cSim.inverseTotalMass;
CM.y *= cSim.inverseTotalMass;
CM.z *= cSim.inverseTotalMass;
sCM[threadIdx.x] = CM;
__syncthreads();
// Reduce CM for CTA
unsigned int offset = 1;
unsigned int mask = 1;
while (offset < blockDim.x)
{
if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x))
{
sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x;
sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y;
sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z;
}
mask = 2 * mask + 1;
offset *= 2;
__syncthreads();
}
if (threadIdx.x == 0)
{
float4 CM;
CM.x = sCM[0].x;
CM.y = sCM[0].y;
CM.z = sCM[0].z;
CM.w = 0.0f;
cSim.pLinearMomentum[blockIdx.x] = CM;
}
#endif
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment