/* -------------------------------------------------------------------------- * * OpenMM * * -------------------------------------------------------------------------- * * This is part of the OpenMM molecular simulation toolkit originating from * * Simbios, the NIH National Center for Physics-Based Simulation of * * Biological Structures at Stanford, funded under the NIH Roadmap for * * Medical Research, grant U54 GM072970. See https://simtk.org. * * * * Portions copyright (c) 2009 Stanford University and the Authors. * * Authors: Scott Le Grand, Peter Eastman * * Contributors: * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU Lesser General Public License as published * * by the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU Lesser General Public License for more details. * * * * You should have received a copy of the GNU Lesser General Public License * * along with this program. If not, see . * * -------------------------------------------------------------------------- */ #include #include #include #include #include #include //#include using namespace std; #define DeltaShake #include "gputypes.h" struct Atom { float3 rij1; float3 rij2; float3 rij3; float M; float d2; float InvMassI; float rij1sq; float rij2sq; float rij3sq; }; static __constant__ cudaGmxSimulation cSim; void SetUpdateShakeHSim(gpuContext gpu) { cudaError_t status; status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation)); RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed"); } void GetUpdateShakeHSim(gpuContext gpu) { cudaError_t status; status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation)); RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed"); } __global__ void kUpdatePart1_kernel() { unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x; unsigned int rpos = cSim.pRandomPosition[blockIdx.x]; __syncthreads(); while (pos < cSim.atoms) { float4 velocity = cSim.pVelm4[pos]; float4 xVector = cSim.pxVector4[pos]; float4 random4a = cSim.pRandom4a[rpos + pos]; float2 random2a = cSim.pRandom2a[rpos + pos]; float4 apos = cSim.pPosq[pos]; float4 force = cSim.pForce4[pos]; float3 Vmh; float sqrtInvMass = sqrt(velocity.w); Vmh.x = xVector.x * cSim.DOverTauC + sqrtInvMass * random4a.x; Vmh.y = xVector.y * cSim.DOverTauC + sqrtInvMass * random4a.y; Vmh.z = xVector.z * cSim.DOverTauC + sqrtInvMass * random4a.z; float4 vVector; vVector.x = sqrtInvMass * random4a.w; vVector.y = sqrtInvMass * random2a.x; vVector.z = sqrtInvMass * random2a.y; vVector.w = 0.0f; cSim.pvVector4[pos] = vVector; velocity.x = velocity.x * cSim.EM + velocity.w * force.x * cSim.TauOneMinusEM + vVector.x - cSim.EM * Vmh.x; velocity.y = velocity.y * cSim.EM + velocity.w * force.y * cSim.TauOneMinusEM + vVector.y - cSim.EM * Vmh.y; velocity.z = velocity.z * cSim.EM + velocity.w * force.z * cSim.TauOneMinusEM + vVector.z - cSim.EM * Vmh.z; cSim.pOldPosq[pos] = apos; #ifndef DeltaShake apos.x += velocity.x * cSim.fix1; apos.y += velocity.y * cSim.fix1; apos.z += velocity.z * cSim.fix1; #else apos.x = velocity.x * cSim.fix1; apos.y = velocity.y * cSim.fix1; apos.z = velocity.z * cSim.fix1; #endif cSim.pPosqP[pos] = apos; cSim.pVelm4[pos] = velocity; pos += blockDim.x * gridDim.x; } } __global__ void kUpdatePart1CM_kernel() { extern __shared__ float3 sCM[]; unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x; unsigned int rpos = cSim.pRandomPosition[blockIdx.x]; float3 CM = { 0.0f, 0.0f, 0.0f}; float4 CM1 = { 0.0f, 0.0f, 0.0f, 0.0f }; // Read CM outputs from previous step unsigned int cpos = threadIdx.x; #if 0 float4 CM2 = { 0.0f, 0.0f, 0.0f, 0.0f }; float4 CM3 = { 0.0f, 0.0f, 0.0f, 0.0f }; float4 CM4 = { 0.0f, 0.0f, 0.0f, 0.0f }; if (cpos < gridDim.x) CM1 = cSim.pLinearMomentum[cpos]; cpos += gridDim.x; if (cpos < gridDim.x) CM2 = cSim.pLinearMomentum[cpos]; cpos += gridDim.x; if (cpos < gridDim.x) CM3 = cSim.pLinearMomentum[cpos]; cpos += gridDim.x; if (cpos < gridDim.x) CM4 = cSim.pLinearMomentum[cpos]; sCM[threadIdx.x].x = CM1.x + CM2.x + CM3.x + CM4.x; sCM[threadIdx.x].y = CM1.y + CM2.y + CM3.y + CM4.y; sCM[threadIdx.x].z = CM1.z + CM2.z + CM3.z + CM4.z; #else while (cpos < gridDim.x) { CM1 = cSim.pLinearMomentum[cpos]; CM.x += CM1.x; CM.y += CM1.y; CM.z += CM1.z; cpos += blockDim.x; } sCM[threadIdx.x].x = CM.x; sCM[threadIdx.x].y = CM.y; sCM[threadIdx.x].z = CM.z; #endif __syncthreads(); // Reduce CM unsigned int offset = 1; unsigned int mask = 1; while (offset < blockDim.x) { if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x)) { sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x; sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y; sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z; } mask = 2 * mask + 1; offset *= 2; __syncthreads(); } while (pos < cSim.atoms) { float4 velocity = cSim.pVelm4[pos]; float4 xVector = cSim.pxVector4[pos]; float4 random4a = cSim.pRandom4a[rpos + pos]; float2 random2a = cSim.pRandom2a[rpos + pos]; float4 apos = cSim.pPosq[pos]; float4 force = cSim.pForce4[pos]; float3 Vmh; float sqrtInvMass = sqrt(velocity.w); Vmh.x = xVector.x * cSim.DOverTauC + sqrtInvMass * random4a.x; Vmh.y = xVector.y * cSim.DOverTauC + sqrtInvMass * random4a.y; Vmh.z = xVector.z * cSim.DOverTauC + sqrtInvMass * random4a.z; float4 vVector; vVector.x = sqrtInvMass * random4a.w; vVector.y = sqrtInvMass * random2a.x; vVector.z = sqrtInvMass * random2a.y; vVector.w = 0.0f; cSim.pvVector4[pos] = vVector; velocity.x = velocity.x * cSim.EM + velocity.w * force.x * cSim.TauOneMinusEM + vVector.x - cSim.EM * Vmh.x - sCM[0].x; velocity.y = velocity.y * cSim.EM + velocity.w * force.y * cSim.TauOneMinusEM + vVector.y - cSim.EM * Vmh.y - sCM[0].y; velocity.z = velocity.z * cSim.EM + velocity.w * force.z * cSim.TauOneMinusEM + vVector.z - cSim.EM * Vmh.z - sCM[0].z; cSim.pOldPosq[pos] = apos; #ifndef DeltaShake apos.x += velocity.x * cSim.fix1; apos.y += velocity.y * cSim.fix1; apos.z += velocity.z * cSim.fix1; #else apos.x = velocity.x * cSim.fix1; apos.y = velocity.y * cSim.fix1; apos.z = velocity.z * cSim.fix1; #endif cSim.pPosqP[pos] = apos; cSim.pVelm4[pos] = velocity; pos += blockDim.x * gridDim.x; } } void kUpdatePart1(gpuContext gpu) { // printf("kUpdatePart1\n"); #if 0 static int iteration = 0; if (iteration == 0) { gpu->psPosq4->Download(); gpu->psVelm4->Download(); printf("# %d atoms\n", gpu->natoms); for (int i = 0; i < gpu->natoms; i++) { printf("%5d %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f %11.5f\n", i, gpu->psPosq4->_pSysStream[0][i].x, gpu->psPosq4->_pSysStream[0][i].y, gpu->psPosq4->_pSysStream[0][i].z, gpu->psPosq4->_pSysStream[0][i].w, gpu->psVelm4->_pSysStream[0][i].x, gpu->psVelm4->_pSysStream[0][i].y, gpu->psVelm4->_pSysStream[0][i].z, gpu->psVelm4->_pSysStream[0][i].w ); } } iteration++; #endif #if 0 static const float KILO = 1e3; // Thousand static const float BOLTZMANN = 1.380658e-23f; // (J/K) static const float AVOGADRO = 6.0221367e23f; // () static const float RGAS = BOLTZMANN * AVOGADRO; // (J/(mol K)) static const float BOLTZ = (RGAS / KILO); // (kJ/(mol K)) static int iteration = 0; // Check T if (iteration % 1000 == 0) { gpu->psVelm4->Download(); float ke = 0.0f; for (int i = 0; i < gpu->natoms; i++) { float vx = gpu->psVelm4->_pSysStream[0][i].x; float vy = gpu->psVelm4->_pSysStream[0][i].y; float vz = gpu->psVelm4->_pSysStream[0][i].z; float m = 1.0f / gpu->psVelm4->_pSysStream[0][i].w; ke += m * (vx * vx + vy * vy + vz * vz); } float T = ke / (BOLTZ * gpu->sim.degreesOfFreedom); printf("Iteration %d, Temperature is %f\n", iteration, T); } iteration++; #endif if (gpu->bRemoveCM) { kUpdatePart1CM_kernel<<sim.blocks, gpu->sim.update_threads_per_block, gpu->sim.update_threads_per_block * sizeof(float3)>>>(); LAUNCHERROR("kUpdatePart1CM"); gpu->bRemoveCM = false; #if 0 gpu->psLinearMomentum->Download(); gpu->psVelm4->Download(); float3 mv = {0.0f, 0.0f, 0.0f}; for (int i = 0; i < gpu->natoms; i++) { float mass = 1.0f / gpu->psVelm4->_pSysStream[0][i].w; mv.x += mass * gpu->psVelm4->_pSysStream[0][i].x; mv.y += mass * gpu->psVelm4->_pSysStream[0][i].y; mv.z += mass * gpu->psVelm4->_pSysStream[0][i].z; } mv.x *= gpu->sim.inverseTotalMass; mv.y *= gpu->sim.inverseTotalMass; mv.z *= gpu->sim.inverseTotalMass; float3 mv1 = {0.0f, 0.0f, 0.0f}; for (int i = 0; i < gpu->sim.blocks; i++) { mv1.x += gpu->psLinearMomentum->_pSysStream[0][i].x; mv1.y += gpu->psLinearMomentum->_pSysStream[0][i].y; mv1.z += gpu->psLinearMomentum->_pSysStream[0][i].z; } printf("%11.5f %11.5f %11.5f | %11.5f %11.5f %11.5f\n", mv.x, mv.y, mv.z, mv1.x, mv1.y, mv1.z); #endif } else { kUpdatePart1_kernel<<sim.blocks, gpu->sim.update_threads_per_block>>>(); LAUNCHERROR("kUpdatePart1"); } } __global__ void kApplyFirstShake_kernel() { __shared__ Atom sA[G8X_THREADS_PER_BLOCK]; Atom* psA = &sA[threadIdx.x]; unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x; while (pos < cSim.ShakeConstraints) { int4 atomID = cSim.pShakeID[pos]; float4 params = cSim.pShakeParameter[pos]; float4 apos = cSim.pOldPosq[atomID.x]; float4 xpi = cSim.pPosqP[atomID.x]; float4 apos1 = cSim.pOldPosq[atomID.y]; float4 xpj1 = cSim.pPosqP[atomID.y]; float4 apos2 = {0.0f, 0.0f, 0.0f, 0.0f}; float4 xpj2 = {0.0f, 0.0f, 0.0f, 0.0f}; psA->InvMassI = params.x; psA->M = params.y; psA->d2 = params.z; float invMassJ = params.w; if (atomID.z != -1) { apos2 = cSim.pOldPosq[atomID.z]; xpj2 = cSim.pPosqP[atomID.z]; } float4 apos3 = {0.0f, 0.0f, 0.0f, 0.0f}; float4 xpj3 = {0.0f, 0.0f, 0.0f, 0.0f}; if (atomID.w != -1) { apos3 = cSim.pOldPosq[atomID.w]; xpj3 = cSim.pPosqP[atomID.w]; } float3 xi, xj1, xj2, xj3; xi.x = apos.x; xi.y = apos.y; xi.z = apos.z; xj1.x = apos1.x; xj1.y = apos1.y; xj1.z = apos1.z; xj2.x = apos2.x; xj2.y = apos2.y; xj2.z = apos2.z; xj3.x = apos3.x; xj3.y = apos3.y; xj3.z = apos3.z; #ifndef DeltaShake xpi.x -= xi.x; xpi.y -= xi.y; xpi.z -= xi.z; xpj1.x -= xj1.x; xpj1.y -= xj1.y; xpj1.z -= xj1.z; xpj2.x -= xj2.x; xpj2.y -= xj2.y; xpj2.z -= xj2.z; xpj3.x -= xj3.x; xpj3.y -= xj3.y; xpj3.z -= xj3.z; #endif psA->rij1.x = xi.x - xj1.x; psA->rij1.y = xi.y - xj1.y; psA->rij1.z = xi.z - xj1.z; psA->rij2.x = xi.x - xj2.x; psA->rij2.y = xi.y - xj2.y; psA->rij2.z = xi.z - xj2.z; psA->rij3.x = xi.x - xj3.x; psA->rij3.y = xi.y - xj3.y; psA->rij3.z = xi.z - xj3.z; psA->rij1sq = psA->rij1.x * psA->rij1.x + psA->rij1.y * psA->rij1.y + psA->rij1.z * psA->rij1.z; psA->rij2sq = psA->rij2.x * psA->rij2.x + psA->rij2.y * psA->rij2.y + psA->rij2.z * psA->rij2.z; psA->rij3sq = psA->rij3.x * psA->rij3.x + psA->rij3.y * psA->rij3.y + psA->rij3.z * psA->rij3.z; float ld1 = psA->d2 - psA->rij1sq; float ld2 = psA->d2 - psA->rij2sq; float ld3 = psA->d2 - psA->rij3sq; bool converged = false; int iteration = 0; while (iteration < 15 && !converged) { converged = true; float3 rpij; rpij.x = xpi.x - xpj1.x; rpij.y = xpi.y - xpj1.y; rpij.z = xpi.z - xpj1.z; float rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z; float rrpr = psA->rij1.x * rpij.x + psA->rij1.y * rpij.y + psA->rij1.z * rpij.z; float diff = fabs(ld1 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance); if (diff >= 1.0f) { float acor = (ld1 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij1sq); float3 dr; dr.x = psA->rij1.x * acor; dr.y = psA->rij1.y * acor; dr.z = psA->rij1.z * acor; xpi.x += dr.x * psA->InvMassI; xpi.y += dr.y * psA->InvMassI; xpi.z += dr.z * psA->InvMassI; xpj1.x -= dr.x * invMassJ; xpj1.y -= dr.y * invMassJ; xpj1.z -= dr.z * invMassJ; converged = false; } if (atomID.z != -1) { rpij.x = xpi.x - xpj2.x; rpij.y = xpi.y - xpj2.y; rpij.z = xpi.z - xpj2.z; rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z; rrpr = psA->rij2.x * rpij.x + psA->rij2.y * rpij.y + psA->rij2.z * rpij.z; diff = fabs(ld2 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance); if (diff >= 1.0f) { float acor = (ld2 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij2sq); float3 dr; dr.x = psA->rij2.x * acor; dr.y = psA->rij2.y * acor; dr.z = psA->rij2.z * acor; xpi.x += dr.x * psA->InvMassI; xpi.y += dr.y * psA->InvMassI; xpi.z += dr.z * psA->InvMassI; xpj2.x -= dr.x * invMassJ; xpj2.y -= dr.y * invMassJ; xpj2.z -= dr.z * invMassJ; converged = false; } } if (atomID.w != -1) { rpij.x = xpi.x - xpj3.x; rpij.y = xpi.y - xpj3.y; rpij.z = xpi.z - xpj3.z; rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z; rrpr = psA->rij3.x * rpij.x + psA->rij3.y * rpij.y + psA->rij3.z * rpij.z; diff = fabs(ld3 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance); if (diff >= 1.0f) { float acor = (ld3 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij3sq); float3 dr; dr.x = psA->rij3.x * acor; dr.y = psA->rij3.y * acor; dr.z = psA->rij3.z * acor; xpi.x += dr.x * psA->InvMassI; xpi.y += dr.y * psA->InvMassI; xpi.z += dr.z * psA->InvMassI; xpj3.x -= dr.x * invMassJ; xpj3.y -= dr.y * invMassJ; xpj3.z -= dr.z * invMassJ; converged = false; } } iteration++; } #ifndef DeltaShake xpi.x += xi.x; xpi.y += xi.y; xpi.z += xi.z; xpj1.x += xj1.x; xpj1.y += xj1.y; xpj1.z += xj1.z; xpj2.x += xj2.x; xpj2.y += xj2.y; xpj2.z += xj2.z; xpj3.x += xj3.x; xpj3.y += xj3.y; xpj3.z += xj3.z; #endif cSim.pPosqP[atomID.x] = xpi; cSim.pPosqP[atomID.y] = xpj1; if (atomID.z != -1) cSim.pPosqP[atomID.z] = xpj2; if (atomID.w != -1) cSim.pPosqP[atomID.w] = xpj3; pos += blockDim.x * gridDim.x; } } void kApplyFirstShake(gpuContext gpu) { // printf("kApplyFirstShake\n"); if (gpu->sim.ShakeConstraints > 0) { kApplyFirstShake_kernel<<sim.blocks, gpu->sim.shake_threads_per_block>>>(); LAUNCHERROR("kApplyFirstShake"); } } __global__ void kUpdatePart2_kernel() { unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x; unsigned int rpos = cSim.pRandomPosition[blockIdx.x]; __syncthreads(); while (pos < cSim.atoms) { float4 velocity = cSim.pVelm4[pos]; #ifndef DeltaShake float4 apos = cSim.pPosq[pos]; #endif float4 xPrime = cSim.pPosqP[pos]; float4 vVector = cSim.pvVector4[pos]; float4 xVector; float4 random4b = cSim.pRandom4b[rpos + pos]; float2 random2b = cSim.pRandom2b[rpos + pos]; float3 Xmh; float sqrtInvMass = sqrt(velocity.w); #ifdef DeltaShake velocity.x = xPrime.x * cSim.oneOverFix1; velocity.y = xPrime.y * cSim.oneOverFix1; velocity.z = xPrime.z * cSim.oneOverFix1; #else velocity.x = (xPrime.x - apos.x) * cSim.oneOverFix1; velocity.y = (xPrime.y - apos.y) * cSim.oneOverFix1; velocity.z = (xPrime.z - apos.z) * cSim.oneOverFix1; #endif Xmh.x = vVector.x * cSim.TauDOverEMMinusOne + sqrtInvMass * random4b.x; Xmh.y = vVector.y * cSim.TauDOverEMMinusOne + sqrtInvMass * random4b.y; Xmh.z = vVector.z * cSim.TauDOverEMMinusOne + sqrtInvMass * random4b.z; xVector.x = sqrtInvMass * random4b.w; xVector.y = sqrtInvMass * random2b.x; xVector.z = sqrtInvMass * random2b.y; xPrime.x += xVector.x - Xmh.x; xPrime.y += xVector.y - Xmh.y; xPrime.z += xVector.z - Xmh.z; cSim.pPosq[pos] = xPrime; cSim.pVelm4[pos] = velocity; cSim.pxVector4[pos] = xVector; pos += blockDim.x * gridDim.x; } // Update random position pointer if (threadIdx.x == 0) { rpos += cSim.paddedNumberOfAtoms; if (rpos > cSim.randoms) rpos -= cSim.randoms; cSim.pRandomPosition[blockIdx.x] = rpos; } } __global__ void kUpdatePart2CM_kernel() { extern __shared__ float3 sCM[]; unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x; unsigned int rpos = cSim.pRandomPosition[blockIdx.x]; float3 CM = {0.0f, 0.0f, 0.0f}; __syncthreads(); while (pos < cSim.atoms) { float4 velocity = cSim.pVelm4[pos]; #ifndef DeltaShake float4 apos = cSim.pPosq[pos]; #endif float4 xPrime = cSim.pPosqP[pos]; float4 vVector = cSim.pvVector4[pos]; float4 xVector; float4 random4b = cSim.pRandom4b[rpos + pos]; float2 random2b = cSim.pRandom2b[rpos + pos]; float3 Xmh; float mass = 1.0f / velocity.w; float sqrtInvMass = sqrt(velocity.w); #ifdef DeltaShake velocity.x = xPrime.x * cSim.oneOverFix1; velocity.y = xPrime.y * cSim.oneOverFix1; velocity.z = xPrime.z * cSim.oneOverFix1; #else velocity.x = (xPrime.x - apos.x) * cSim.oneOverFix1; velocity.y = (xPrime.y - apos.y) * cSim.oneOverFix1; velocity.z = (xPrime.z - apos.z) * cSim.oneOverFix1; #endif CM.x += mass * velocity.x; CM.y += mass * velocity.y; CM.z += mass * velocity.z; Xmh.x = vVector.x * cSim.TauDOverEMMinusOne + sqrtInvMass * random4b.x; Xmh.y = vVector.y * cSim.TauDOverEMMinusOne + sqrtInvMass * random4b.y; Xmh.z = vVector.z * cSim.TauDOverEMMinusOne + sqrtInvMass * random4b.z; xVector.x = sqrtInvMass * random4b.w; xVector.y = sqrtInvMass * random2b.x; xVector.z = sqrtInvMass * random2b.y; xPrime.x += xVector.x - Xmh.x; xPrime.y += xVector.y - Xmh.y; xPrime.z += xVector.z - Xmh.z; cSim.pPosq[pos] = xPrime; cSim.pVelm4[pos] = velocity; cSim.pxVector4[pos] = xVector; pos += blockDim.x * gridDim.x; } // Update random position pointer if (threadIdx.x == 0) { rpos += cSim.paddedNumberOfAtoms; if (rpos > cSim.randoms) rpos -= cSim.randoms; cSim.pRandomPosition[blockIdx.x] = rpos; } // Scale CM CM.x *= cSim.inverseTotalMass; CM.y *= cSim.inverseTotalMass; CM.z *= cSim.inverseTotalMass; sCM[threadIdx.x] = CM; __syncthreads(); // Reduce CM for CTA unsigned int offset = 1; unsigned int mask = 1; while (offset < blockDim.x) { if (((threadIdx.x & mask) == 0) && (threadIdx.x + offset < blockDim.x)) { sCM[threadIdx.x].x += sCM[threadIdx.x + offset].x; sCM[threadIdx.x].y += sCM[threadIdx.x + offset].y; sCM[threadIdx.x].z += sCM[threadIdx.x + offset].z; } mask = 2 * mask + 1; offset *= 2; __syncthreads(); } if (threadIdx.x == 0) { float4 CM; CM.x = sCM[0].x; CM.y = sCM[0].y; CM.z = sCM[0].z; CM.w = 0.0f; cSim.pLinearMomentum[blockIdx.x] = CM; } } extern void kGenerateRandoms(gpuContext gpu); void kUpdatePart2(gpuContext gpu) { // printf("kUpdatePart2\n"); if (gpu->bCalculateCM) { kUpdatePart2CM_kernel<<sim.blocks, gpu->sim.update_threads_per_block, gpu->sim.update_threads_per_block * sizeof(float3)>>>(); LAUNCHERROR("kUpdatePart2CM"); gpu->bCalculateCM = false; gpu->bRemoveCM = true; #if 0 gpu->psLinearMomentum->Download(); gpu->psVelm4->Download(); float3 mv = {0.0f, 0.0f, 0.0f}; for (int i = 0; i < gpu->natoms; i++) { float mass = 1.0f / gpu->psVelm4->_pSysStream[0][i].w; mv.x += mass * gpu->psVelm4->_pSysStream[0][i].x; mv.y += mass * gpu->psVelm4->_pSysStream[0][i].y; mv.z += mass * gpu->psVelm4->_pSysStream[0][i].z; } mv.x *= gpu->sim.inverseTotalMass; mv.y *= gpu->sim.inverseTotalMass; mv.z *= gpu->sim.inverseTotalMass; float3 mv1 = {0.0f, 0.0f, 0.0f}; for (int i = 0; i < gpu->sim.blocks; i++) { mv1.x += gpu->psLinearMomentum->_pSysStream[0][i].x; mv1.y += gpu->psLinearMomentum->_pSysStream[0][i].y; mv1.z += gpu->psLinearMomentum->_pSysStream[0][i].z; } printf("%11.5f %11.5f %11.5f | %11.5f %11.5f %11.5f\n", mv.x, mv.y, mv.z, mv1.x, mv1.y, mv1.z); #endif } else { kUpdatePart2_kernel<<sim.blocks, gpu->sim.update_threads_per_block>>>(); LAUNCHERROR("kUpdatePart2"); } // Update randoms if necessary gpu->iterations++; if (gpu->iterations == gpu->sim.randomIterations) { kGenerateRandoms(gpu); gpu->iterations = 0; } } __global__ void kApplySecondShake_kernel() { __shared__ Atom sA[G8X_THREADS_PER_BLOCK]; Atom* psA = &sA[threadIdx.x]; unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x; while (pos < cSim.ShakeConstraints) { int4 atomID = cSim.pShakeID[pos]; float4 params = cSim.pShakeParameter[pos]; float4 apos = cSim.pOldPosq[atomID.x]; float4 xpi = cSim.pPosq[atomID.x]; float4 apos1 = cSim.pOldPosq[atomID.y]; float4 xpj1 = cSim.pPosq[atomID.y]; float4 apos2 = {0.0f, 0.0f, 0.0f, 0.0f}; float4 xpj2 = {0.0f, 0.0f, 0.0f, 0.0f}; psA->InvMassI = params.x; psA->M = params.y; psA->d2 = params.z; float invMassJ = params.w; if (atomID.z != -1) { apos2 = cSim.pOldPosq[atomID.z]; xpj2 = cSim.pPosq[atomID.z]; } float4 apos3 = {0.0f, 0.0f, 0.0f, 0.0f}; float4 xpj3 = {0.0f, 0.0f, 0.0f, 0.0f}; if (atomID.w != -1) { apos3 = cSim.pOldPosq[atomID.w]; xpj3 = cSim.pPosq[atomID.w]; } float3 xi, xj1, xj2, xj3; xi.x = apos.x; xi.y = apos.y; xi.z = apos.z; xj1.x = apos1.x; xj1.y = apos1.y; xj1.z = apos1.z; xj2.x = apos2.x; xj2.y = apos2.y; xj2.z = apos2.z; xj3.x = apos3.x; xj3.y = apos3.y; xj3.z = apos3.z; #ifndef DeltaShake xpi.x -= xi.x; xpi.y -= xi.y; xpi.z -= xi.z; xpj1.x -= xj1.x; xpj1.y -= xj1.y; xpj1.z -= xj1.z; xpj2.x -= xj2.x; xpj2.y -= xj2.y; xpj2.z -= xj2.z; xpj3.x -= xj3.x; xpj3.y -= xj3.y; xpj3.z -= xj3.z; #endif psA->rij1.x = xi.x - xj1.x; psA->rij1.y = xi.y - xj1.y; psA->rij1.z = xi.z - xj1.z; psA->rij2.x = xi.x - xj2.x; psA->rij2.y = xi.y - xj2.y; psA->rij2.z = xi.z - xj2.z; psA->rij3.x = xi.x - xj3.x; psA->rij3.y = xi.y - xj3.y; psA->rij3.z = xi.z - xj3.z; psA->rij1sq = psA->rij1.x * psA->rij1.x + psA->rij1.y * psA->rij1.y + psA->rij1.z * psA->rij1.z; psA->rij2sq = psA->rij2.x * psA->rij2.x + psA->rij2.y * psA->rij2.y + psA->rij2.z * psA->rij2.z; psA->rij3sq = psA->rij3.x * psA->rij3.x + psA->rij3.y * psA->rij3.y + psA->rij3.z * psA->rij3.z; float ld1 = psA->d2 - psA->rij1sq; float ld2 = psA->d2 - psA->rij2sq; float ld3 = psA->d2 - psA->rij3sq; bool converged = false; int iteration = 0; while (iteration < 15 && !converged) { converged = true; float3 rpij; rpij.x = xpi.x - xpj1.x; rpij.y = xpi.y - xpj1.y; rpij.z = xpi.z - xpj1.z; float rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z; float rrpr = psA->rij1.x * rpij.x + psA->rij1.y * rpij.y + psA->rij1.z * rpij.z; float diff = fabs(ld1 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance ); if (diff >= 1.0f) { float acor = (ld1 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij1sq); float3 dr; dr.x = psA->rij1.x * acor; dr.y = psA->rij1.y * acor; dr.z = psA->rij1.z * acor; xpi.x += dr.x * psA->InvMassI; xpi.y += dr.y * psA->InvMassI; xpi.z += dr.z * psA->InvMassI; xpj1.x -= dr.x * invMassJ; xpj1.y -= dr.y * invMassJ; xpj1.z -= dr.z * invMassJ; converged = false; } if (atomID.z != -1) { rpij.x = xpi.x - xpj2.x; rpij.y = xpi.y - xpj2.y; rpij.z = xpi.z - xpj2.z; rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z; rrpr = psA->rij2.x * rpij.x + psA->rij2.y * rpij.y + psA->rij2.z * rpij.z; diff = fabs(ld2 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance ); if (diff >= 1.0f) { float acor = (ld2 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij2sq); float3 dr; dr.x = psA->rij2.x * acor; dr.y = psA->rij2.y * acor; dr.z = psA->rij2.z * acor; xpi.x += dr.x * psA->InvMassI; xpi.y += dr.y * psA->InvMassI; xpi.z += dr.z * psA->InvMassI; xpj2.x -= dr.x * invMassJ; xpj2.y -= dr.y * invMassJ; xpj2.z -= dr.z * invMassJ; converged = false; } } if (atomID.w != -1) { rpij.x = xpi.x - xpj3.x; rpij.y = xpi.y - xpj3.y; rpij.z = xpi.z - xpj3.z; rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z; rrpr = psA->rij3.x * rpij.x + psA->rij3.y * rpij.y + psA->rij3.z * rpij.z; diff = fabs(ld3 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance ); if (diff >= 1.0f) { float acor = (ld3 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij3sq); float3 dr; dr.x = psA->rij3.x * acor; dr.y = psA->rij3.y * acor; dr.z = psA->rij3.z * acor; xpi.x += dr.x * psA->InvMassI; xpi.y += dr.y * psA->InvMassI; xpi.z += dr.z * psA->InvMassI; xpj3.x -= dr.x * invMassJ; xpj3.y -= dr.y * invMassJ; xpj3.z -= dr.z * invMassJ; converged = false; } } iteration++; } xpi.x += xi.x; xpi.y += xi.y; xpi.z += xi.z; xpj1.x += xj1.x; xpj1.y += xj1.y; xpj1.z += xj1.z; xpj2.x += xj2.x; xpj2.y += xj2.y; xpj2.z += xj2.z; xpj3.x += xj3.x; xpj3.y += xj3.y; xpj3.z += xj3.z; cSim.pPosq[atomID.x] = xpi; cSim.pPosq[atomID.y] = xpj1; if (atomID.z != -1) cSim.pPosq[atomID.z] = xpj2; if (atomID.w != -1) cSim.pPosq[atomID.w] = xpj3; pos += blockDim.x * gridDim.x; } } __global__ void kApplyNoShake_kernel() { unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x; while (pos < cSim.NonShakeConstraints) { int atomID = cSim.pNonShakeID[pos]; float4 apos = cSim.pOldPosq[atomID]; float4 xpi = cSim.pPosq[atomID]; xpi.x += apos.x; xpi.y += apos.y; xpi.z += apos.z; cSim.pPosq[atomID] = xpi; pos += blockDim.x * gridDim.x; } } void kCPUShake2(gpuContext gpu) { } void kApplySecondShake(gpuContext gpu) { // printf("kApplySecondShake\n"); // kCPUShake2(gpu); if (gpu->sim.ShakeConstraints > 0) { kApplySecondShake_kernel<<sim.blocks, gpu->sim.shake_threads_per_block>>>(); LAUNCHERROR("kApplySecondShake"); } // handle non-Shake atoms #ifdef DeltaShake if (gpu->sim.NonShakeConstraints > 0) { //fprintf( gpu->log, "kApplyNoShake_kernel %d %d \n", gpu->sim.blocks, gpu->sim.nonshake_threads_per_block); fflush( gpu->log ); kApplyNoShake_kernel<<sim.blocks, gpu->sim.nonshake_threads_per_block>>>(); LAUNCHERROR("kApplyNoShake"); } #endif }