/* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
 * -------------------------------------------------------------------------- *
 * This is part of the OpenMM molecular simulation toolkit originating from   *
 * Simbios, the NIH National Center for Physics-Based Simulation of           *
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
 * Portions copyright (c) 2009 Stanford University and the Authors.           *
 * Authors: Scott Le Grand, Peter Eastman                                     *
 * Contributors:                                                              *
 *                                                                            *
 * Permission is hereby granted, free of charge, to any person obtaining a    *
 * copy of this software and associated documentation files (the "Software"), *
 * to deal in the Software without restriction, including without limitation  *
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
 * and/or sell copies of the Software, and to permit persons to whom the      *
 * Software is furnished to do so, subject to the following conditions:       *
 *                                                                            *
 * The above copyright notice and this permission notice shall be included in *
 * all copies or substantial portions of the Software.                        *
 *                                                                            *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
 * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */

#include <cuda.h>
#include <vector_functions.h>
using namespace std;

#include "gputypes.h"


static __constant__ cudaGmxSimulation cSim;

void SetLincsSim(gpuContext gpu)
{
    cudaError_t status;
    status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
}

void GetLincsSim(gpuContext gpu)
{
    cudaError_t status;
    status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
    RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
}

__global__ void kUpdateAtomPositions_kernel(float4* atomPositions)
{
    // Update the atom positions based on the solution to the matrix equations.

    unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
    while (pos < cSim.atoms)
    {
        float4 atomPos = atomPositions[pos];
        float invMass = cSim.pVelm4[pos].w;
        int num = cSim.pLincsNumAtomConstraints[pos];
        for (int i = 0; i < num; i++)
        {
            int index = pos+i*cSim.atoms;
            int constraint = cSim.pLincsAtomConstraints[index];
            float4 dir = cSim.pLincsDistance[constraint];
            float c = invMass*cSim.pLincsS[constraint]*cSim.pLincsSolution[constraint];
            c = (cSim.pLincsAtoms[constraint].x == pos ? -c : c);
            atomPos.x += c*dir.x;
            atomPos.y += c*dir.y;
            atomPos.z += c*dir.z;
        }
        atomPositions[pos] = atomPos;
        pos += blockDim.x * gridDim.x;
    }
}

__global__ void kIterateLincsMatrix_kernel(int iteration)
{
    // Perform one iteration of inverting the matrix.

    float* rhs1 = (iteration%2 == 0 ? cSim.pLincsRhs1 : cSim.pLincsRhs2);
    float* rhs2 = (iteration%2 == 0 ? cSim.pLincsRhs2 : cSim.pLincsRhs1);
    unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
    while (pos < cSim.lincsConstraints)
    {
        float rhs = 0.0f;
        int num = cSim.pLincsNumConnections[pos];
        for (int i = 0; i < num; i++)
        {
            int index = pos+i*cSim.lincsConstraints;
            int otherConstraint = cSim.pLincsConnections[index];
            rhs += cSim.pLincsCoupling[index]*rhs1[otherConstraint];
        }
        rhs2[pos] = rhs;
        cSim.pLincsSolution[pos] += rhs;
        pos += blockDim.x * gridDim.x;
    }
}

__global__ void kApplyLincsPart1_kernel(float4* atomPositions, bool addOldPosition)
{
   // Calculate the direction of each constraint, along with the initial RHS and solution vectors.

    unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
    while (pos < cSim.lincsConstraints)
    {
        int2 atoms = cSim.pLincsAtoms[pos];
        float4 delta1 = atomPositions[atoms.x];
        float4 delta2 = atomPositions[atoms.y];
        float4 dir = cSim.pLincsDistance[pos];
        if (addOldPosition)
        {
            float4 oldPos1 = cSim.pOldPosq[atoms.x];
            float4 oldPos2 = cSim.pOldPosq[atoms.y];
            dir.x = (oldPos1.x-oldPos2.x)+(delta1.x-delta2.x);
            dir.y = (oldPos1.y-oldPos2.y)+(delta1.y-delta2.y);
            dir.z = (oldPos1.z-oldPos2.z)+(delta1.z-delta2.z);
        }
        else
        {
            dir.x = delta1.x-delta2.x;
            dir.y = delta1.y-delta2.y;
            dir.z = delta1.z-delta2.z;
        }
        float invLength = 1.0f/sqrt(dir.x*dir.x + dir.y*dir.y + dir.z*dir.z);
        dir.x *= invLength;
        dir.y *= invLength;
        dir.z *= invLength;
        cSim.pLincsDistance[pos] = dir;
        float diff = cSim.pLincsS[pos]*(1.0f/invLength-dir.w);
        cSim.pLincsRhs1[pos] = diff;
        cSim.pLincsSolution[pos] = diff;
        pos += blockDim.x * gridDim.x;
    }
}

__global__ void kApplyLincsPart2_kernel()
{
    // Build the coupling matrix.

    unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
    while (pos < cSim.lincsConstraints)
    {
        float4 dir1 = cSim.pLincsDistance[pos];
        int2 atoms1 = cSim.pLincsAtoms[pos];
        int num = cSim.pLincsNumConnections[pos];
        float s = cSim.pLincsS[pos];
        float invMass = cSim.pVelm4[atoms1.x].w;
        for (int i = 0; i < num; i++)
        {
            int index = pos+i*cSim.lincsConstraints;
            int otherConstraint = cSim.pLincsConnections[index];
            float4 dir2 = cSim.pLincsDistance[otherConstraint];
            int2 atoms2 = cSim.pLincsAtoms[otherConstraint];
            float signedMass = (atoms1.x == atoms2.x || atoms1.y == atoms2.y ? -invMass : cSim.pVelm4[atoms1.y].w);
            cSim.pLincsCoupling[index] = signedMass*s*(dir1.x*dir2.x+dir1.y*dir2.y+dir1.z*dir2.z)*cSim.pLincsS[otherConstraint];
        }
        pos += blockDim.x * gridDim.x;
    }
}

__global__ void kApplyLincsPart3_kernel(float4* atomPositions, bool addOldPosition)
{
    // Correct for rotational lengthening.

    unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
    while (pos < cSim.lincsConstraints)
    {
        int2 atoms = cSim.pLincsAtoms[pos];
        float4 delta1 = atomPositions[atoms.x];
        float4 delta2 = atomPositions[atoms.y];
        float3 delta;
        if (addOldPosition)
        {
            float4 oldPos1 = cSim.pOldPosq[atoms.x];
            float4 oldPos2 = cSim.pOldPosq[atoms.y];
            delta = make_float3((oldPos1.x-oldPos2.x)+(delta1.x-delta2.x),
                                (oldPos1.y-oldPos2.y)+(delta1.y-delta2.y),
                                (oldPos1.z-oldPos2.z)+(delta1.z-delta2.z));
        }
        else
        {
            delta = make_float3(delta1.x-delta2.x, delta1.y-delta2.y, delta1.z-delta2.z);
        }
        float distance = cSim.pLincsDistance[pos].w;
        float p2 = 2.0f*distance*distance-(delta.x*delta.x+delta.y*delta.y+delta.z*delta.z);
        p2 = (p2 < 0.0f ? 0.0f : p2);
        float diff = cSim.pLincsS[pos]*(distance-sqrt(p2));
        cSim.pLincsRhs1[pos] = diff;
        cSim.pLincsSolution[pos] = diff;
        pos += blockDim.x * gridDim.x;
    }
}

static void kApplyLincs(gpuContext gpu, float4* atomPositions, bool addOldPosition)
{
    kApplyLincsPart1_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>(atomPositions, addOldPosition);
    LAUNCHERROR("kApplyLincsPart1");
    kApplyLincsPart2_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>();
    LAUNCHERROR("kApplyLincsPart2");
    for (int i = 0; i < gpu->sim.lincsTerms; ++i)
    {
        kIterateLincsMatrix_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>(i);
        LAUNCHERROR("kIterateLincsMatrix_kernel");
    }
    kUpdateAtomPositions_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>(atomPositions);
    LAUNCHERROR("kUpdateAtomPositions");
    kApplyLincsPart3_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>(atomPositions, addOldPosition);
    LAUNCHERROR("kApplyLincsPart3");
    for (int i = 0; i < gpu->sim.lincsTerms; ++i)
    {
        kIterateLincsMatrix_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>(i);
        LAUNCHERROR("kIterateLincsMatrix_kernel");
    }
    kUpdateAtomPositions_kernel<<<gpu->sim.blocks, gpu->sim.lincs_threads_per_block>>>(atomPositions);
    LAUNCHERROR("kUpdateAtomPositions");
}

void kApplyFirstLincs(gpuContext gpu)
{
//    printf("kApplyFirstLincs\n");
    if (gpu->sim.lincsConstraints > 0)
        kApplyLincs(gpu, gpu->sim.pPosqP, true);
}

void kApplySecondLincs(gpuContext gpu)
{
//    printf("kApplySecondLincs\n");
    if (gpu->sim.lincsConstraints > 0)
        kApplyLincs(gpu, gpu->sim.pPosq, false);
}