/* Repository upload metadata (GitHub scrape artifact, kept as a comment so the
   file remains parseable):
   Unverified commit ca86f720, authored by zcxzcx1, committed via GitHub.
   Commit message: "Add files via upload". Parent commit: b75ed73c. */
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/, Sandia National Laboratories
LAMMPS development team: developers@lammps.org
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef PAIR_CLASS
PairStyle(d3, PairD3)
#else
#ifndef LMP_PAIR_D3
#define LMP_PAIR_D3
#include <cmath>
#include <string>
#include <vector>
#include <algorithm>
#include <map>
#include <unordered_map>
#include <cuda_runtime.h>
#include "pair.h"
#include "utils.h"
#include "atom.h"
#include "domain.h"
#include "error.h"
#include "comm.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "math_extra.h"
#include "pair_d3_pars.h"
// Removed dependencies to STL
// #include <stdlib.h> -> no more C style functions
// #define _USE_MATH_DEFINES -> no predefined constants
// Removed dependencies to LAMMPS
// #include "potential_file_reader.h" -> removed, PotentialFileReader
// #include "memory.h" -> already no dependency for CUDA version
namespace LAMMPS_NS {
/* CUDA-accelerated DFT-D3 dispersion-correction pair style.
   The host-side interface follows the LAMMPS Pair contract; most arrays are
   allocated with cudaMallocManaged (see allocate()) so the same pointers can
   be used from host code and from CUDA kernels. */
class PairD3 : public Pair {
public:
    PairD3(class LAMMPS*);
    ~PairD3() override;
    void compute(int, int) override;
    void settings(int, char**) override;
    void coeff(int, char**) override;
    double init_one(int i, int j) override;
    void init_style() override;
    void write_restart(FILE*) override;
    void read_restart(FILE*) override;
    void write_restart_settings(FILE*) override;
    void read_restart_settings(FILE*) override;
protected:
    virtual void allocate();
    /* ------- Read parameters ------- */
    int find_atomic_number(std::string&);                  // element symbol -> atomic number (1-based); -1 if unknown
    int is_int_in_array(int*, int, int);                   // linear search; returns index of value or -1
    void read_r0ab(int*, int);                             // fill r0ab cut-off radii for the selected elements
    void get_limit_in_pars_array(int&, int&, int&, int&);  // decode packed atom-number/grid indices of the C6 table
    void read_c6ab(int*, int);                             // fill c6ab reference values for the selected elements
    void setfuncpar_zero();                                // functional parameters, zero damping
    void setfuncpar_bj();                                  // functional parameters, Becke-Johnson damping
    void setfuncpar_zerom();                               // functional parameters, modified zero damping
    void setfuncpar_bjm();                                 // functional parameters, modified BJ damping
    void setfuncpar();                                     // dispatch on `damping`, then derive a1/a2/s8/alp6/alp8
    /* ------- Read parameters ------- */
    /* ------- Lattice information ------- */
    void set_lattice_repetition_criteria(float, int*);
    void set_lattice_vectors();
    /* ------- Lattice information ------- */
    /* ------- Initialize & Precalculate ------- */
    void load_atom_info();
    void precalculate_tau_array();
    /* ------- Initialize & Precalculate ------- */
    /* ------- Reallocate (when number of atoms changed) ------- */
    void reallocate_arrays();
    /* ------- Reallocate (when number of atoms changed) ------- */
    /* ------- Coordination number ------- */
    void get_coordination_number();
    void get_dC6_dCNij();
    /* ------- Coordination number ------- */
    /* ------- Main workers ------- */
    void get_forces_without_dC6_zero();
    void get_forces_without_dC6_bj();
    void get_forces_without_dC6_zerom();
    void get_forces_without_dC6_bjm();
    void get_forces_without_dC6();
    void get_forces_with_dC6();
    void update(int, int);
    /* ------- Main workers ------- */
    /*--------- Constants ---------*/
    static constexpr int MAX_ELEM = 94;             // maximum of the element number
    static constexpr int MAXC = 5;                  // maximum coordination number references per element
    static constexpr double AU_TO_ANG = 0.52917726; // conversion factors (atomic unit --> angstrom)
    static constexpr double AU_TO_EV = 27.21138505; // conversion factors (atomic unit --> eV)
    static constexpr float K1 = 16.0;               // global ad hoc parameters
    static constexpr float K3 = -4.0;               // global ad hoc parameters
    /*--------- Constants ---------*/
    /*--------- Parameters to read ---------*/
    int damping;                 // damping scheme index: 0=zero, 1=bj, 2=zerom, 3=bjm (see settings())
    std::string functional;      // exchange-correlation functional keyword
    float* r2r4 = nullptr;       // scale r4/r2 values of the atoms by sqrt(Z)
    float* rcov = nullptr;       // covalent radii
    int* mxc = nullptr;          // How large the grid for c6 interpolation
    float** r0ab = nullptr;      // cut-off radii for all element pairs
    float***** c6ab = nullptr;   // C6 for all element pairs
    float rthr;                  // R^2 distance to cutoff for C calculation
    float cnthr;                 // R^2 distance to cutoff for CN_calculation
    float s6, s8, s18, rs6, rs8, rs18, alp, alp6, alp8, a1, a2; // parameters for D3
    /*--------- Parameters to read ---------*/
    /*--------- Lattice related values ---------*/
    double* lat_v_1 = nullptr;   // lattice coordination vector
    double* lat_v_2 = nullptr;   // lattice coordination vector
    double* lat_v_3 = nullptr;   // lattice coordination vector
    int* rep_vdw = nullptr;      // repetition of cell for calculating D3
    int* rep_cn = nullptr;       // repetition of cell for calculating
    double** sigma = nullptr;    // virial pressure on cell
    /*--------- Lattice related values ---------*/
    /*--------- Per-atom values/arrays ---------*/
    double* cn = nullptr;        // Coordination numbers
    float** x = nullptr;         // Positions
    double** f = nullptr;        // Forces
    double* dc6i = nullptr;      // dC6i(iat) saves dE_dsp/dCN(iat)
    /*--------- Per-atom values/arrays ---------*/
    /*--------- Per-pair values/arrays ---------*/
    float* c6_ij_tot = nullptr;    // C6 per (i,j) pair, triangular linear index
    float* dc6_iji_tot = nullptr;  // dC6/dCN_i per pair
    float* dc6_ijj_tot = nullptr;  // dC6/dCN_j per pair
    /*--------- Per-pair values/arrays ---------*/
    /*---------- Global values ---------*/
    int n_save;        // to check whether the number of atoms has changed
    float disp_total;  // Dispersion energy
    /*---------- Global values ---------*/
    /*--------- For loop over tau (translation of cell) ---------*/
    float**** tau_vdw = nullptr;
    float**** tau_cn = nullptr;
    int* tau_idx_vdw = nullptr;
    int* tau_idx_cn = nullptr;
    int tau_idx_vdw_total_size;
    int tau_idx_cn_total_size;
    /*--------- For loop over tau (translation of cell) ---------*/
    /*--------- For cuda memory transfer (pointerized) ---------*/
    int *atomtype;     // element type per atom (device-visible)
    double *disp;      // accumulated dispersion energy (device-visible scalar)
    /*--------- For cuda memory transfer (pointerized) ---------*/
};
}
#endif // LMP_PAIR_D3
#endif // PAIR_CLASS
/*
This code is a skeleton of the LAMMPS pair_style d3 accelerated by CUDA.
All dependencies on LAMMPS have been removed.
The input and output variables are named based on the LAMMPS variables as much as possible.
*/
#include "pair_d3_for_ase.h"
/* --------- Macros for CUDA error handling --------- */
/* Timing helpers. START/STOP must be used as a pair, at most once per scope:
   START_CUDA_TIMER deliberately declares the events `start`/`stop` in the
   enclosing scope so STOP_CUDA_TIMER can reference them. */
#define START_CUDA_TIMER() \
    cudaEvent_t start, stop; \
    cudaEventCreate(&start); \
    cudaEventCreate(&stop); \
    cudaEventRecord(start);
/* Wrapped in do { } while (0) (consistent with the CHECK_* macros below) so
   the macro acts as a single statement after `if`/`else` and the local `msec`
   no longer leaks into the caller's scope. */
#define STOP_CUDA_TIMER(tag) \
    do { \
        cudaEventRecord(stop); \
        cudaEventSynchronize(stop); \
        float msec = 0; \
        cudaEventElapsedTime(&msec, start, stop); \
        printf("Elapsed time for %s: %f ms\n", tag, msec); \
        cudaEventDestroy(start); \
        cudaEventDestroy(stop); \
    } while (0);
/* Abort with file/line context if a CUDA runtime call returns an error. */
#define CHECK_CUDA(call) do { \
    cudaError_t status_ = call; \
    if (status_ != cudaSuccess) { \
        fprintf(stderr, "CUDA Error (%s:%d) -> %s: %s\n", __FILE__, __LINE__, \
            cudaGetErrorName(status_), cudaGetErrorString(status_)); \
        exit(EXIT_FAILURE); \
    } \
} while (0)
/* Synchronize, then surface any asynchronous kernel-execution error. */
#define CHECK_CUDA_ERROR() do { \
    cudaDeviceSynchronize(); \
    cudaError_t status_ = cudaGetLastError(); \
    if (status_ != cudaSuccess) { \
        fprintf(stderr, "CUDA Error (%s:%d) -> %s: %s\n", __FILE__, __LINE__, \
            cudaGetErrorName(status_), cudaGetErrorString(status_)); \
        exit(EXIT_FAILURE); \
    } \
} while (0)
/* Fail fast when no CUDA-capable device is visible. */
#define CHECK_CUDA_DEVICES() do { \
    int deviceCount = 0; \
    if (cudaGetDeviceCount(&deviceCount) != cudaSuccess || deviceCount == 0) { \
        fprintf(stderr, "CUDA Error (%s:%d) -> No CUDA devices found\n", \
            __FILE__, __LINE__); \
        exit(EXIT_FAILURE); \
    } \
} while(0)
/* --------- Macros for CUDA error handling --------- */
/* --------- Math functions for CUDA compatibility --------- */
/* Unroll the triangular pair loop: map a linear pair index `linij` back to
   (i, j) with 0 <= j <= i, where linij = i*(i+1)/2 + j.
   The inverse uses sqrt in double precision; for indices near a triangular
   boundary the truncation of the float result can be off by one, so the
   computed row is corrected until j lands in [0, i]. */
inline __host__ __device__ void ij_at_linij(int linij, int &i, int &j) {
    i = static_cast<int>((sqrt(8.0 * linij + 1.0) - 1.0) * 0.5);
    j = linij - i * (i + 1) / 2;
    if (j < 0) {                      // sqrt rounded up: step one row back
        --i;
        j = linij - i * (i + 1) / 2;
    } else if (j > i) {               // sqrt rounded down: step one row forward
        ++i;
        j = linij - i * (i + 1) / 2;
    }
}
/* Squared Euclidean length of a 3-vector (float). Mirrors MathExtra::lensq3. */
inline __host__ __device__ float lensq3(const float *v)
{
    const float vx = v[0];
    const float vy = v[1];
    const float vz = v[2];
    return vx * vx + vy * vy + vz * vz;
}
/* Cross product ans = v1 x v2 for 3-vectors (double). */
inline void cross3(const double *v1, const double *v2, double *ans)
{
    const double ax = v1[0], ay = v1[1], az = v1[2];
    const double bx = v2[0], by = v2[1], bz = v2[2];
    ans[0] = ay * bz - az * by;
    ans[1] = az * bx - ax * bz;
    ans[2] = ax * by - ay * bx;
}
/* Dot product of two 3-vectors (double). */
inline double dot3(const double *v1, const double *v2)
{
    double acc = 0.0;
    for (int k = 0; k < 3; ++k) {
        acc += v1[k] * v2[k];
    }
    return acc;
}
/* Euclidean length of a 3-vector (double). */
inline double len3(const double *v)
{
    const double sq = v[0] * v[0] + v[1] * v[1] + v[2] * v[2];
    return sqrt(sq);
}
/* --------- Math functions for CUDA compatibility --------- */
/* ----------------------------------------------------------------------
Constructor (Required)
------------------------------------------------------------------------- */
PairD3::PairD3() {
    // All arrays are allocated lazily in allocate(); nothing is owned yet.
    allocated = 0;
}
/* ----------------------------------------------------------------------
Destructor (Required)
------------------------------------------------------------------------- */
PairD3::~PairD3() {
    if (allocated) {
        int n = atom->natoms;
        int np1 = atom->ntypes + 1;
        // Extent of the tau translation grids: 2*rep+1 cells per direction.
        // NOTE(review): if the repetition counts were never set (still -1 from
        // allocate()), these extents are negative, the loops below are skipped,
        // and cudaFree on the still-null tau arrays is a no-op — safe either way.
        int vdw_range_x = 2 * rep_vdw[0] + 1;
        int vdw_range_y = 2 * rep_vdw[1] + 1;
        int vdw_range_z = 2 * rep_vdw[2] + 1;
        int cn_range_x = 2 * rep_cn[0] + 1;
        int cn_range_y = 2 * rep_cn[1] + 1;
        int cn_range_z = 2 * rep_cn[2] + 1;
        //for (int i = 0; i < np1; i++) { cudaFree(setflag[i]); }; cudaFree(setflag);
        //for (int i = 0; i < np1; i++) { cudaFree(cutsq[i]); }; cudaFree(cutsq);
        // Per-element parameter arrays
        cudaFree(r2r4);
        cudaFree(rcov);
        cudaFree(mxc);
        for (int i = 0; i < np1; i++) { cudaFree(r0ab[i]); }; cudaFree(r0ab);
        // 5-level nested c6ab: free leaves first, then each intermediate level
        for (int i = 0; i < np1; i++) {
            for (int j = 0; j < np1; j++) {
                for (int k = 0; k < MAXC; k++) {
                    for (int l = 0; l < MAXC; l++) {
                        cudaFree(c6ab[i][j][k][l]);
                    }
                    cudaFree(c6ab[i][j][k]);
                }
                cudaFree(c6ab[i][j]);
            }
            cudaFree(c6ab[i]);
        }
        cudaFree(c6ab);
        // Lattice vectors and cell repetition counts
        cudaFree(lat_v_1);
        cudaFree(lat_v_2);
        cudaFree(lat_v_3);
        cudaFree(rep_vdw);
        cudaFree(rep_cn);
        // Per-atom arrays
        cudaFree(cn);
        for (int i = 0; i < n; i++) { cudaFree(x[i]); }; cudaFree(x);
        cudaFree(dc6i);
        for (int i = 0; i < n; i++) { cudaFree(f[i]); }; cudaFree(f);
        for (int i = 0; i < 3; i++) { cudaFree(sigma[i]); }; cudaFree(sigma);
        // Per-pair arrays (triangular linear index)
        cudaFree(dc6_iji_tot);
        cudaFree(dc6_ijj_tot);
        cudaFree(c6_ij_tot);
        // tau translation grids (vdW and CN cutoffs)
        for (int i = 0; i < vdw_range_x; i++) {
            for (int j = 0; j < vdw_range_y; j++) {
                for (int k = 0; k < vdw_range_z; k++) {
                    cudaFree(tau_vdw[i][j][k]);
                }
                cudaFree(tau_vdw[i][j]);
            }
            cudaFree(tau_vdw[i]);
        }
        cudaFree(tau_vdw);
        for (int i = 0; i < cn_range_x; i++) {
            for (int j = 0; j < cn_range_y; j++) {
                for (int k = 0; k < cn_range_z; k++) {
                    cudaFree(tau_cn[i][j][k]);
                }
                cudaFree(tau_cn[i][j]);
            }
            cudaFree(tau_cn[i]);
        }
        cudaFree(tau_cn);
        cudaFree(tau_idx_vdw);
        cudaFree(tau_idx_cn);
        cudaFree(atomtype);
        cudaFree(disp);
    }
}
/* ----------------------------------------------------------------------
Allocate all arrays (Required)
------------------------------------------------------------------------- */
void PairD3::allocate() {
    CHECK_CUDA_DEVICES();
    allocated = 1;
    /* atom->ntypes : # of elements; element index starts from 1 */
    int n = atom->natoms;
    int np1 = atom->ntypes + 1;
    n_save = n;
    np1_save = np1;
    // NOTE(review): the cudaMallocManaged calls below are unchecked; a failed
    // allocation would only surface later as a fault. Consider wrapping them
    // with the CHECK_CUDA macro defined above.
    //cudaMallocManaged(&setflag, np1 * sizeof(int*)); for (int i = 0; i < np1; i++) { cudaMallocManaged(&setflag[i], np1 * sizeof(int)); }
    //cudaMallocManaged(&cutsq, np1 * sizeof(double*)); for (int i = 0; i < np1; i++) { cudaMallocManaged(&cutsq[i], np1 * sizeof(double)); }
    // Per-element parameter arrays (index 0 unused; types are 1-based)
    cudaMallocManaged(&r2r4, np1 * sizeof(float));
    cudaMallocManaged(&rcov, np1 * sizeof(float));
    cudaMallocManaged(&mxc, np1 * sizeof(int));
    cudaMallocManaged(&r0ab, np1 * sizeof(float*)); for (int i = 0; i < np1; i++) { cudaMallocManaged(&r0ab[i], np1 * sizeof(float)); }
    // c6ab[type1][type2][grid_i][grid_j] -> {C6_ref, CN_ref_1, CN_ref_2}
    cudaMallocManaged(&c6ab, np1 * sizeof(float****));
    for (int i = 0; i < np1; i++) {
        cudaMallocManaged(&c6ab[i], np1 * sizeof(float***));
        for (int j = 0; j < np1; j++) {
            cudaMallocManaged(&c6ab[i][j], MAXC * sizeof(float**));
            for (int k = 0; k < MAXC; k++) {
                cudaMallocManaged(&c6ab[i][j][k], MAXC * sizeof(float*));
                for (int l = 0; l < MAXC; l++) {
                    cudaMallocManaged(&c6ab[i][j][k][l], 3 * sizeof(float));
                }
            }
        }
    }
    // Lattice vectors and cell repetition counts
    // NOTE(review): lat_v_* are declared double* in the header but allocated
    // with sizeof(float) here — 3*sizeof(float) undersizes a double[3]; confirm
    // the intended element type.
    cudaMallocManaged(&lat_v_1, 3 * sizeof(float));
    cudaMallocManaged(&lat_v_2, 3 * sizeof(float));
    cudaMallocManaged(&lat_v_3, 3 * sizeof(float));
    cudaMallocManaged(&rep_vdw, 3 * sizeof(int));
    cudaMallocManaged(&rep_cn, 3 * sizeof(int));
    cudaMallocManaged(&sigma, 3 * sizeof(double*)); for (int i = 0; i < 3; i++) { cudaMallocManaged(&sigma[i], 3 * sizeof(double)); }
    // Per-atom arrays
    cudaMallocManaged(&cn, n * sizeof(double));
    cudaMallocManaged(&x, n * sizeof(float*)); for (int i = 0; i < n; i++) { cudaMallocManaged(&x[i], 3 * sizeof(float)); }
    cudaMallocManaged(&dc6i, n * sizeof(double));
    cudaMallocManaged(&f, n * sizeof(double*)); for (int i = 0; i < n; i++) { cudaMallocManaged(&f[i], 3 * sizeof(double)); }
    // Initialization
    // Initialize for lattice -> set_lattice_vectors()
    // Sentinel values: tau grids and repetitions are not yet computed.
    tau_idx_vdw_total_size = -1;
    tau_idx_cn_total_size = -1;
    for (int i = 0; i < 3; i++) {
        rep_vdw[i] = -1;
        rep_cn[i] = -1;
    }
    //for (int i = 1; i < np1; i++) {
    //    for (int j = 1; j < np1; j++) {
    //        setflag[i][j] = 0;
    //    }
    //}
    // Mark every c6ab entry as "not set"; read_c6ab() fills the valid ones.
    for (int idx1 = 0; idx1 < np1; idx1++) {
        for (int idx2 = 0; idx2 < np1; idx2++) {
            for (int idx3 = 0; idx3 < MAXC; idx3++) {
                for (int idx4 = 0; idx4 < MAXC; idx4++) {
                    for (int idx5 = 0; idx5 < 3; idx5++) {
                        c6ab[idx1][idx2][idx3][idx4][idx5] = -1;
                    }
                }
            }
        }
    }
    // Per-pair arrays over the upper triangle (i <= j), linear index
    int n_ij_combination = n * (n + 1) / 2;
    cudaMallocManaged(&dc6_iji_tot, n_ij_combination * sizeof(float));
    cudaMallocManaged(&dc6_ijj_tot, n_ij_combination * sizeof(float));
    cudaMallocManaged(&c6_ij_tot, n_ij_combination * sizeof(float));
    cudaMallocManaged(&atomtype, n * sizeof(int));
    cudaMallocManaged(&disp, sizeof(double));
}
/* ----------------------------------------------------------------------
Settings : read from pair_style (Required) -> pair_style d3 vdw_sq cn_sq damp_name func_name
------------------------------------------------------------------------- */
/* Parse pair_style settings: vdW cutoff^2, CN cutoff^2, damping scheme and
   functional name; then resolve the functional parameters via setfuncpar(). */
void PairD3::settings(double vdw_sq, double cn_sq, std::string damp_name, std::string func_name) {
    rthr = vdw_sq;    // R^2 cutoff for the dispersion (C6/C8) part
    cnthr = cn_sq;    // R^2 cutoff for the coordination-number part
    // Damping keyword -> internal index consumed by setfuncpar().
    static const std::map<std::string, int> dampingMap = {
        {"damp_zero", 0}, {"damp_bj", 1}, {"damp_zerom", 2}, {"damp_bjm", 3},
    };
    const auto it = dampingMap.find(damp_name);
    if (it == dampingMap.end()) {
        error->all(FLERR, "Unknown damping function");
    }
    damping = it->second;   // single lookup instead of find + operator[]
    functional = func_name;
    setfuncpar();
}
/* ----------------------------------------------------------------------
finds atomic number (used in PairD3::coeff)
------------------------------------------------------------------------- */
/* Map an element symbol to its atomic number (1-based, H..Pu).
   The key is normalized in place: lowercased and padded/truncated to exactly
   two characters ("h " style), matching the table encoding below.
   Returns -1 when the symbol is not in the table. */
int PairD3::find_atomic_number(std::string& key) {
    std::transform(key.begin(), key.end(), key.begin(), ::tolower);
    if (key.length() == 1) { key += " "; }
    key.resize(2);
    // `static` so the 94-entry table is built once instead of constructing a
    // vector of strings on every call.
    static const char* const element_table[] = {
        "h ","he",
        "li","be","b ","c ","n ","o ","f ","ne",
        "na","mg","al","si","p ","s ","cl","ar",
        "k ","ca","sc","ti","v ","cr","mn","fe","co","ni","cu",
        "zn","ga","ge","as","se","br","kr",
        "rb","sr","y ","zr","nb","mo","tc","ru","rh","pd","ag",
        "cd","in","sn","sb","te","i ","xe",
        "cs","ba","la","ce","pr","nd","pm","sm","eu","gd","tb","dy",
        "ho","er","tm","yb","lu","hf","ta","w ","re","os","ir","pt",
        "au","hg","tl","pb","bi","po","at","rn",
        "fr","ra","ac","th","pa","u ","np","pu"
    };
    constexpr int n_elements = sizeof(element_table) / sizeof(element_table[0]);
    for (int i = 0; i < n_elements; ++i) {
        if (key == element_table[i]) {
            return i + 1;   // atomic numbers are 1-based
        }
    }
    // symbol not found
    return -1;
}
/* ----------------------------------------------------------------------
Check whether an integer value in an integer array (used in PairD3::coeff)
------------------------------------------------------------------------- */
/* Linear search: return the index of `value` in `arr` (length `size`),
   or -1 when it is absent. */
int PairD3::is_int_in_array(int arr[], int size, int value) {
    int pos = 0;
    while (pos < size) {
        if (arr[pos] == value) {
            return pos;
        }
        ++pos;
    }
    return -1;
}
/* ----------------------------------------------------------------------
Read r0ab values from the table (used in PairD3::coeff)
------------------------------------------------------------------------- */
/* Fill r0ab (pairwise cut-off radii, atomic units) for the selected elements.
   `atomic_numbers` is 0-based over types; r0ab is indexed 1-based by type. */
void PairD3::read_r0ab(int* atomic_numbers, int ntypes) {
    // `static` keeps this ~70 KB table out of the stack frame and initializes
    // it once instead of on every call.
    static const double r0ab_table[94][94] = R0AB_TABLE;
    for (int i = 1; i <= ntypes; i++) {
        for (int j = 1; j <= ntypes; j++) {
            // table is 0-based by atomic number; convert Angstrom -> a.u.
            r0ab[i][j] = r0ab_table[atomic_numbers[i-1]-1][atomic_numbers[j-1]-1] / AU_TO_ANG;
        }
    }
}
/* ----------------------------------------------------------------------
Get atom pair indices and grid indices (used in PairD3::read_c6ab)
------------------------------------------------------------------------- */
/* Decode the packed atom-number columns of the C6 table.
   Each packed value encodes atom number (1..100) plus a CN-grid offset in
   steps of 100: grid index = (packed-1)/100 + 1, atom = (packed-1)%100 + 1.
   The atom numbers are rewritten in place; the grid indices are returned
   through idx_i / idx_j. */
void PairD3::get_limit_in_pars_array(int& idx_atom_1, int& idx_atom_2, int& idx_i, int& idx_j) {
    constexpr int shift = 100;
    const int packed_1 = idx_atom_1 - 1;
    const int packed_2 = idx_atom_2 - 1;
    idx_i = packed_1 / shift + 1;
    idx_j = packed_2 / shift + 1;
    idx_atom_1 = packed_1 % shift + 1;
    idx_atom_2 = packed_2 % shift + 1;
}
/* ----------------------------------------------------------------------
Read c6ab values from the table (used in PairD3::coeff)
------------------------------------------------------------------------- */
/* Fill c6ab (C6 reference values vs. coordination number) for the selected
   elements, and record in mxc how many CN grid points each type uses. */
void PairD3::read_c6ab(int* atomic_numbers, int ntypes) {
    for (int i = 1; i <= ntypes; i++) { mxc[i] = 0; }
    int grid_i = 0, grid_j = 0;
    // `static` keeps this ~1.27 MB table out of the stack frame (a plain local
    // of this size risks stack overflow) and initializes it only once.
    static const double c6ab_table[32385][5] = C6AB_TABLE;
    for (int i = 0; i < 32385; i++) {
        const double ref_c6 = c6ab_table[i][0];
        // columns 1/2 pack atom number + CN-grid index; decode them
        int atom_number_1 = static_cast<int>(c6ab_table[i][1]);
        int atom_number_2 = static_cast<int>(c6ab_table[i][2]);
        get_limit_in_pars_array(atom_number_1, atom_number_2, grid_i, grid_j);
        // skip rows for elements that are not present in this system
        const int idx_atom_1 = is_int_in_array(atomic_numbers, ntypes, atom_number_1);
        if (idx_atom_1 < 0) { continue; }
        const int idx_atom_2 = is_int_in_array(atomic_numbers, ntypes, atom_number_2);
        if (idx_atom_2 < 0) { continue; }
        const double ref_cn1 = c6ab_table[i][3];
        const double ref_cn2 = c6ab_table[i][4];
        mxc[idx_atom_1 + 1] = std::max(mxc[idx_atom_1 + 1], grid_i);
        mxc[idx_atom_2 + 1] = std::max(mxc[idx_atom_2 + 1], grid_j);
        // store both orderings; the CN references swap with the atom order
        c6ab[idx_atom_1 + 1][idx_atom_2 + 1][grid_i - 1][grid_j - 1][0] = ref_c6;
        c6ab[idx_atom_1 + 1][idx_atom_2 + 1][grid_i - 1][grid_j - 1][1] = ref_cn1;
        c6ab[idx_atom_1 + 1][idx_atom_2 + 1][grid_i - 1][grid_j - 1][2] = ref_cn2;
        c6ab[idx_atom_2 + 1][idx_atom_1 + 1][grid_j - 1][grid_i - 1][0] = ref_c6;
        c6ab[idx_atom_2 + 1][idx_atom_1 + 1][grid_j - 1][grid_i - 1][1] = ref_cn2;
        c6ab[idx_atom_2 + 1][idx_atom_1 + 1][grid_j - 1][grid_i - 1][2] = ref_cn1;
    }
}
/* ----------------------------------------------------------------------
Set functional parameters (used in PairD3::coeff)
------------------------------------------------------------------------- */
/* Zero-damping DFT-D3 parameters per functional (def2-QZVP reference data).
   Sets rs6, s18 and sometimes rs18/s6; setfuncpar() later maps these onto
   the working parameters (a1/a2/s8/alp6/alp8).
   NOTE: an unknown functional maps to commandCode 0 (operator[] inserts 0),
   which falls through to the default error branch. */
void PairD3::setfuncpar_zero() {
    s6 = 1.0;
    alp = 14.0;
    rs18 = 1.0;
    // default def2-QZVP (almost basis set limit)
    std::unordered_map<std::string, int> commandMap = {
        { "slater-dirac-exchange", 1}, { "b-lyp", 2 }, { "b-p", 3 }, { "b97-d", 4 }, { "revpbe", 5 },
        { "pbe", 6 }, { "pbesol", 7 }, { "rpw86-pbe", 8 }, { "rpbe", 9 }, { "tpss", 10 },
        { "b3-lyp", 11 }, { "pbe0", 12 }, { "hse06", 13 }, { "revpbe38", 14 }, { "pw6b95", 15 },
        { "tpss0", 16 }, { "b2-plyp", 17 }, { "pwpb95", 18 }, { "b2gp-plyp", 19 }, { "ptpss", 20 },
        { "hf", 21 }, { "mpwlyp", 22 }, { "bpbe", 23 }, { "bh-lyp", 24 }, { "tpssh", 25 },
        { "pwb6k", 26 }, { "b1b95", 27 }, { "bop", 28 }, { "o-lyp", 29 }, { "o-pbe", 30 },
        { "ssb", 31 }, { "revssb", 32 }, { "otpss", 33 }, { "b3pw91", 34 }, { "revpbe0", 35 },
        { "pbe38", 36 }, { "mpw1b95", 37 }, { "mpwb1k", 38 }, { "bmk", 39 }, { "cam-b3lyp", 40 },
        { "lc-wpbe", 41 }, { "m05", 42 }, { "m052x", 43 }, { "m06l", 44 }, { "m06", 45 },
        { "m062x", 46 }, { "m06hf", 47 }, { "hcth120", 48 }
    };
    int commandCode = commandMap[functional];
    switch (commandCode) {
    case 1: rs6 = 0.999; s18 = -1.957; rs18 = 0.697; break;
    case 2: rs6 = 1.094; s18 = 1.682; break;
    case 3: rs6 = 1.139; s18 = 1.683; break;
    case 4: rs6 = 0.892; s18 = 0.909; break;
    case 5: rs6 = 0.923; s18 = 1.010; break;
    case 6: rs6 = 1.217; s18 = 0.722; break;
    case 7: rs6 = 1.345; s18 = 0.612; break;
    case 8: rs6 = 1.224; s18 = 0.901; break;
    case 9: rs6 = 0.872; s18 = 0.514; break;
    case 10: rs6 = 1.166; s18 = 1.105; break;
    case 11: rs6 = 1.261; s18 = 1.703; break;
    case 12: rs6 = 1.287; s18 = 0.928; break;
    case 13: rs6 = 1.129; s18 = 0.109; break;
    case 14: rs6 = 1.021; s18 = 0.862; break;
    case 15: rs6 = 1.532; s18 = 0.862; break;
    case 16: rs6 = 1.252; s18 = 1.242; break;
    // double-hybrids also scale s6 below 1
    case 17: rs6 = 1.427; s18 = 1.022; s6 = 0.64; break;
    case 18: rs6 = 1.557; s18 = 0.705; s6 = 0.82; break;
    case 19: rs6 = 1.586; s18 = 0.760; s6 = 0.56; break;
    case 20: rs6 = 1.541; s18 = 0.879; s6 = 0.75; break;
    case 21: rs6 = 1.158; s18 = 1.746; break;
    case 22: rs6 = 1.239; s18 = 1.098; break;
    case 23: rs6 = 1.087; s18 = 2.033; break;
    case 24: rs6 = 1.370; s18 = 1.442; break;
    case 25: rs6 = 1.223; s18 = 1.219; break;
    case 26: rs6 = 1.660; s18 = 0.550; break;
    case 27: rs6 = 1.613; s18 = 1.868; break;
    case 28: rs6 = 0.929; s18 = 1.975; break;
    case 29: rs6 = 0.806; s18 = 1.764; break;
    case 30: rs6 = 0.837; s18 = 2.055; break;
    case 31: rs6 = 1.215; s18 = 0.663; break;
    case 32: rs6 = 1.221; s18 = 0.560; break;
    case 33: rs6 = 1.128; s18 = 1.494; break;
    case 34: rs6 = 1.176; s18 = 1.775; break;
    case 35: rs6 = 0.949; s18 = 0.792; break;
    case 36: rs6 = 1.333; s18 = 0.998; break;
    case 37: rs6 = 1.605; s18 = 1.118; break;
    case 38: rs6 = 1.671; s18 = 1.061; break;
    case 39: rs6 = 1.931; s18 = 2.168; break;
    case 40: rs6 = 1.378; s18 = 1.217; break;
    case 41: rs6 = 1.355; s18 = 1.279; break;
    case 42: rs6 = 1.373; s18 = 0.595; break;
    // Minnesota functionals include dispersion; s18 = 0 keeps only the C6 part
    case 43: rs6 = 1.417; s18 = 0.000; break;
    case 44: rs6 = 1.581; s18 = 0.000; break;
    case 45: rs6 = 1.325; s18 = 0.000; break;
    case 46: rs6 = 1.619; s18 = 0.000; break;
    case 47: rs6 = 1.446; s18 = 0.000; break;
    /* DFTB3(zeta = 4.0), old deprecated parameters; case ("dftb3"); rs6 = 1.235; s18 = 0.673; */
    case 48: rs6 = 1.221; s18 = 1.206; break;
    default:
        error->all(FLERR, "Functional name unknown");
        break;
    }
}
/* Becke-Johnson damping DFT-D3 parameters per functional.
   Sets rs6 (a1), s18 (s8), rs18 (a2) and occasionally s6; setfuncpar() later
   maps these onto the working parameters.
   Bug fix: in the original, case 16 (b2-plyp) placed `s6 = 0.64;` AFTER the
   `break;`, making it dead code — the reference DFT-D3(BJ) parameter set uses
   s6 = 0.64 for b2-plyp (as cases 17/18 already do for other double-hybrids). */
void PairD3::setfuncpar_bj() {
    s6 = 1.0;
    alp = 14.0;
    // `static const` builds the map once; find() avoids operator[]'s
    // insertion on unknown keys (code 0 still reaches the default error).
    static const std::unordered_map<std::string, int> commandMap = {
        {"b-p", 1}, {"b-lyp", 2}, {"revpbe", 3}, {"rpbe", 4}, {"b97-d", 5}, {"pbe", 6},
        {"rpw86-pbe", 7}, {"b3-lyp", 8}, {"tpss", 9}, {"hf", 10}, {"tpss0", 11}, {"pbe0", 12},
        {"hse06", 13}, {"revpbe38", 14}, {"pw6b95", 15}, {"b2-plyp", 16}, {"dsd-blyp", 17},
        {"dsd-blyp-fc", 18}, {"bop", 19}, {"mpwlyp", 20}, {"o-lyp", 21}, {"pbesol", 22}, {"bpbe", 23},
        {"opbe", 24}, {"ssb", 25}, {"revssb", 26}, {"otpss", 27}, {"b3pw91", 28}, {"bh-lyp", 29},
        {"revpbe0", 30}, {"tpssh", 31}, {"mpw1b95", 32}, {"pwb6k", 33}, {"b1b95", 34}, {"bmk", 35},
        {"cam-b3lyp", 36}, {"lc-wpbe", 37}, {"b2gp-plyp", 38}, {"ptpss", 39}, {"pwpb95", 40},
        {"hf/mixed", 41}, {"hf/sv", 42}, {"hf/minis", 43}, {"b3-lyp/6-31gd", 44}, {"hcth120", 45},
        {"pw1pw", 46}, {"pwgga", 47}, {"hsesol", 48}, {"hf3c", 49}, {"hf3cv", 50}, {"pbeh3c", 51},
        {"pbeh-3c", 52}, {"wb97m", 53}
    };
    const auto found = commandMap.find(functional);
    const int commandCode = (found == commandMap.end()) ? 0 : found->second;
    switch (commandCode) {
    case 1: rs6 = 0.3946; s18 = 3.2822; rs18 = 4.8516; break;
    case 2: rs6 = 0.4298; s18 = 2.6996; rs18 = 4.2359; break;
    case 3: rs6 = 0.5238; s18 = 2.3550; rs18 = 3.5016; break;
    case 4: rs6 = 0.1820; s18 = 0.8318; rs18 = 4.0094; break;
    case 5: rs6 = 0.5545; s18 = 2.2609; rs18 = 3.2297; break;
    case 6: rs6 = 0.4289; s18 = 0.7875; rs18 = 4.4407; break;
    case 7: rs6 = 0.4613; s18 = 1.3845; rs18 = 4.5062; break;
    case 8: rs6 = 0.3981; s18 = 1.9889; rs18 = 4.4211; break;
    case 9: rs6 = 0.4535; s18 = 1.9435; rs18 = 4.4752; break;
    case 10: rs6 = 0.3385; s18 = 0.9171; rs18 = 2.8830; break;
    case 11: rs6 = 0.3768; s18 = 1.2576; rs18 = 4.5865; break;
    case 12: rs6 = 0.4145; s18 = 1.2177; rs18 = 4.8593; break;
    case 13: rs6 = 0.383; s18 = 2.310; rs18 = 5.685; break;
    case 14: rs6 = 0.4309; s18 = 1.4760; rs18 = 3.9446; break;
    case 15: rs6 = 0.2076; s18 = 0.7257; rs18 = 6.3750; break;
    // bug fix: s6 assignment was dead code after `break` in the original
    case 16: rs6 = 0.3065; s18 = 0.9147; rs18 = 5.0570; s6 = 0.64; break;
    case 17: rs6 = 0.0000; s18 = 0.2130; rs18 = 6.0519; s6 = 0.50; break;
    case 18: rs6 = 0.0009; s18 = 0.2112; rs18 = 5.9807; s6 = 0.50; break;
    case 19: rs6 = 0.4870; s18 = 3.2950; rs18 = 3.5043; break;
    case 20: rs6 = 0.4831; s18 = 2.0077; rs18 = 4.5323; break;
    case 21: rs6 = 0.5299; s18 = 2.6205; rs18 = 2.8065; break;
    case 22: rs6 = 0.4466; s18 = 2.9491; rs18 = 6.1742; break;
    case 23: rs6 = 0.4567; s18 = 4.0728; rs18 = 4.3908; break;
    case 24: rs6 = 0.5512; s18 = 3.3816; rs18 = 2.9444; break;
    case 25: rs6 = -0.0952; s18 = -0.1744; rs18 = 5.2170; break;
    case 26: rs6 = 0.4720; s18 = 0.4389; rs18 = 4.0986; break;
    case 27: rs6 = 0.4634; s18 = 2.7495; rs18 = 4.3153; break;
    case 28: rs6 = 0.4312; s18 = 2.8524; rs18 = 4.4693; break;
    case 29: rs6 = 0.2793; s18 = 1.0354; rs18 = 4.9615; break;
    case 30: rs6 = 0.4679; s18 = 1.7588; rs18 = 3.7619; break;
    case 31: rs6 = 0.4529; s18 = 2.2382; rs18 = 4.6550; break;
    case 32: rs6 = 0.1955; s18 = 1.0508; rs18 = 6.4177; break;
    case 33: rs6 = 0.1805; s18 = 0.9383; rs18 = 7.7627; break;
    case 34: rs6 = 0.2092; s18 = 1.4507; rs18 = 5.5545; break;
    case 35: rs6 = 0.1940; s18 = 2.0860; rs18 = 5.9197; break;
    case 36: rs6 = 0.3708; s18 = 2.0674; rs18 = 5.4743; break;
    case 37: rs6 = 0.3919; s18 = 1.8541; rs18 = 5.0897; break;
    case 38: rs6 = 0.0000; s18 = 0.2597; rs18 = 6.3332; s6 = 0.560; break;
    case 39: rs6 = 0.0000; s18 = 0.2804; rs18 = 6.5745; s6 = 0.750; break;
    case 40: rs6 = 0.0000; s18 = 0.2904; rs18 = 7.3141; s6 = 0.820; break;
    // special HF / DFT with eBSSE correction;
    case 41: rs6 = 0.5607; s18 = 3.9027; rs18 = 4.5622; break;
    case 42: rs6 = 0.4249; s18 = 2.1849; rs18 = 4.2783; break;
    case 43: rs6 = 0.1702; s18 = 0.9841; rs18 = 3.8506; break;
    case 44: rs6 = 0.5014; s18 = 4.0672; rs18 = 4.8409; break;
    case 45: rs6 = 0.3563; s18 = 1.0821; rs18 = 4.3359; break;
    /* DFTB3 old, deprecated parameters : ;
     * case ("dftb3"); rs6 = 0.7461; s18 = 3.209; rs18 = 4.1906;
     * special SCC - DFTB parametrization;
     * full third order DFTB, self consistent charges, hydrogen pair damping with; exponent 4.2;
     */
    case 46: rs6 = 0.3807; s18 = 2.3363; rs18 = 5.8844; break;
    case 47: rs6 = 0.2211; s18 = 2.6910; rs18 = 6.7278; break;
    case 48: rs6 = 0.4650; s18 = 2.9215; rs18 = 6.2003; break;
    // special HF - D3 - gCP - SRB / MINIX parametrization;
    case 49: rs6 = 0.4171; s18 = 0.8777; rs18 = 2.9149; break;
    // special HF - D3 - gCP - SRB2 / ECP - 2G parametrization;
    case 50: rs6 = 0.3063; s18 = 0.5022; rs18 = 3.9856; break;
    // special PBEh - D3 - gCP / def2 - mSVP parametrization;
    case 51: rs6 = 0.4860; s18 = 0.0000; rs18 = 4.5000; break;
    case 52: rs6 = 0.4860; s18 = 0.0000; rs18 = 4.5000; break;
    case 53: rs6 = 0.5660; s18 = 0.3908; rs18 = 3.1280; break;
    default:
        error->all(FLERR, "Functional name unknown");
        break;
    }
}
/* Modified zero-damping DFT-D3 parameters per functional.
   Sets rs6/s18/rs18 (and s6 for b2-plyp); errors out on unknown names. */
void PairD3::setfuncpar_zerom() {
    s6 = 1.0;
    alp = 14.0;
    struct ZeroMParams { const char* name; double p_rs6, p_s18, p_rs18, p_s6; };
    static const ZeroMParams table[] = {
        {"b2-plyp", 1.313134, 0.717543, 0.016035, 0.640000},
        {"b3-lyp",  1.338153, 1.532981, 0.013988, 1.0},
        {"b97-d",   1.151808, 1.020078, 0.035964, 1.0},
        {"b-lyp",   1.279637, 1.841686, 0.014370, 1.0},
        {"b-p",     1.233460, 1.945174, 0.000000, 1.0},
        {"pbe",     2.340218, 0.000000, 0.129434, 1.0},
        {"pbe0",    2.077949, 0.000081, 0.116755, 1.0},
        {"lc-wpbe", 1.366361, 1.280619, 0.003160, 1.0},
    };
    for (const auto& entry : table) {
        if (functional == entry.name) {
            rs6 = entry.p_rs6;
            s18 = entry.p_s18;
            rs18 = entry.p_rs18;
            s6 = entry.p_s6;
            return;
        }
    }
    error->all(FLERR, "Functional name unknown");
}
/* Modified Becke-Johnson damping DFT-D3 parameters per functional.
   Sets rs6/s18/rs18 (and s6 for b2-plyp); errors out on unknown names. */
void PairD3::setfuncpar_bjm() {
    s6 = 1.0;
    alp = 14.0;
    struct BJMParams { const char* name; double p_rs6, p_s18, p_rs18, p_s6; };
    static const BJMParams table[] = {
        {"b2-plyp", 0.486434, 0.672820, 3.656466, 0.640000},
        {"b3-lyp",  0.278672, 1.466677, 4.606311, 1.0},
        {"b97-d",   0.240184, 1.206988, 3.864426, 1.0},
        {"b-lyp",   0.448486, 1.875007, 3.610679, 1.0},
        {"b-p",     0.821850, 3.140281, 2.728151, 1.0},
        {"pbe",     0.012092, 0.358940, 5.938951, 1.0},
        {"pbe0",    0.007912, 0.528823, 6.162326, 1.0},
        {"lc-wpbe", 0.563761, 0.906564, 3.593680, 1.0},
    };
    for (const auto& entry : table) {
        if (functional == entry.name) {
            rs6 = entry.p_rs6;
            s18 = entry.p_s18;
            rs18 = entry.p_rs18;
            s6 = entry.p_s6;
            return;
        }
    }
    error->all(FLERR, "Functional name unknown");
}
/* Resolve the functional parameters for the selected damping scheme, then
   derive the working D3 parameters from the raw rs6/rs18/s18/alp values.
   `damping` is guaranteed to be 0..3 by settings(). */
void PairD3::setfuncpar() {
    switch (damping) {
    case 0: setfuncpar_zero(); break;
    case 1: setfuncpar_bj(); break;
    case 2: setfuncpar_zerom(); break;
    case 3: setfuncpar_bjm(); break;
    }
    rs8 = rs18;
    alp6 = alp;
    alp8 = alp + 2.0;
    // rs10 = rs18
    // alp10 = alp + 4.0;
    a1 = rs6;    // BJ damping a1
    a2 = rs8;    // BJ damping a2
    s8 = s18;    // eighth-order scaling
    // s6 is already defined
}
/* ----------------------------------------------------------------------
Coeff : read from pair_coeff (Required) -> pair_coeff * * element1 element2 ...
------------------------------------------------------------------------- */
/* Fill the per-element parameter arrays (r2r4, rcov, r0ab, c6ab) from the
   reference tables for the atomic numbers of the system's types.
   `atomic_numbers` is 0-based over types; the member arrays are 1-based. */
void PairD3::coeff(int* atomic_numbers) {
    if (!allocated) allocate();
    if (atom->ntypes + 1 != np1_save) { reallocate_arrays_np1(); }
    int ntypes = atom->ntypes;
    /*
    scale r4/r2 values of the atoms by sqrt(Z)
    sqrt is also globally close to optimum
    together with the factor 1/2 this yield reasonable
    c8 for he, ne and ar. for larger Z, C8 becomes too large
    which effectively mimics higher R^n terms neglected due
    to stability reasons
    r2r4 =sqrt(0.5*r2r4(i)*dfloat(i)**0.5 ) with i=elementnumber
    the large number of digits is just to keep the results consistent
    with older versions. They should not imply any higher accuracy than
    the old values
    */
    double r2r4_ref[94] = {
        2.00734898, 1.56637132, 5.01986934, 3.85379032, 3.64446594,
        3.10492822, 2.71175247, 2.59361680, 2.38825250, 2.21522516,
        6.58585536, 5.46295967, 5.65216669, 4.88284902, 4.29727576,
        4.04108902, 3.72932356, 3.44677275, 7.97762753, 7.07623947,
        6.60844053, 6.28791364, 6.07728703, 5.54643096, 5.80491167,
        5.58415602, 5.41374528, 5.28497229, 5.22592821, 5.09817141,
        6.12149689, 5.54083734, 5.06696878, 4.87005108, 4.59089647,
        4.31176304, 9.55461698, 8.67396077, 7.97210197, 7.43439917,
        6.58711862, 6.19536215, 6.01517290, 5.81623410, 5.65710424,
        5.52640661, 5.44263305, 5.58285373, 7.02081898, 6.46815523,
        5.98089120, 5.81686657, 5.53321815, 5.25477007, 11.02204549,
        10.15679528, 9.35167836, 9.06926079, 8.97241155, 8.90092807,
        8.85984840, 8.81736827, 8.79317710, 7.89969626, 8.80588454,
        8.42439218, 8.54289262, 8.47583370, 8.45090888, 8.47339339,
        7.83525634, 8.20702843, 7.70559063, 7.32755997, 7.03887381,
        6.68978720, 6.05450052, 5.88752022, 5.70661499, 5.78450695,
        7.79780729, 7.26443867, 6.78151984, 6.67883169, 6.39024318,
        6.09527958, 11.79156076, 11.10997644, 9.51377795, 8.67197068,
        8.77140725, 8.65402716, 8.53923501, 8.85024712
    }; // atomic <r^2>/<r^4> values
    /*
    covalent radii (taken from Pyykko and Atsumi, Chem. Eur. J. 15, 2009, 188-197)
    values for metals decreased by 10 %
    ! data rcov/
    ! . 0.32, 0.46, 1.20, 0.94, 0.77, 0.75, 0.71, 0.63, 0.64, 0.67
    ! ., 1.40, 1.25, 1.13, 1.04, 1.10, 1.02, 0.99, 0.96, 1.76, 1.54
    ! ., 1.33, 1.22, 1.21, 1.10, 1.07, 1.04, 1.00, 0.99, 1.01, 1.09
    ! ., 1.12, 1.09, 1.15, 1.10, 1.14, 1.17, 1.89, 1.67, 1.47, 1.39
    ! ., 1.32, 1.24, 1.15, 1.13, 1.13, 1.08, 1.15, 1.23, 1.28, 1.26
    ! ., 1.26, 1.23, 1.32, 1.31, 2.09, 1.76, 1.62, 1.47, 1.58, 1.57
    ! ., 1.56, 1.55, 1.51, 1.52, 1.51, 1.50, 1.49, 1.49, 1.48, 1.53
    ! ., 1.46, 1.37, 1.31, 1.23, 1.18, 1.16, 1.11, 1.12, 1.13, 1.32
    ! ., 1.30, 1.30, 1.36, 1.31, 1.38, 1.42, 2.01, 1.81, 1.67, 1.58
    ! ., 1.52, 1.53, 1.54, 1.55 /
    these new data are scaled with k2=4./3. and converted a_0 via
    autoang=0.52917726d0
    */
    double rcov_ref[94] = {
        0.80628308, 1.15903197, 3.02356173, 2.36845659, 1.94011865,
        1.88972601, 1.78894056, 1.58736983, 1.61256616, 1.68815527,
        3.52748848, 3.14954334, 2.84718717, 2.62041997, 2.77159820,
        2.57002732, 2.49443835, 2.41884923, 4.43455700, 3.88023730,
        3.35111422, 3.07395437, 3.04875805, 2.77159820, 2.69600923,
        2.62041997, 2.51963467, 2.49443835, 2.54483100, 2.74640188,
        2.82199085, 2.74640188, 2.89757982, 2.77159820, 2.87238349,
        2.94797246, 4.76210950, 4.20778980, 3.70386304, 3.50229216,
        3.32591790, 3.12434702, 2.89757982, 2.84718717, 2.84718717,
        2.72120556, 2.89757982, 3.09915070, 3.22513231, 3.17473967,
        3.17473967, 3.09915070, 3.32591790, 3.30072128, 5.26603625,
        4.43455700, 4.08180818, 3.70386304, 3.98102289, 3.95582657,
        3.93062995, 3.90543362, 3.80464833, 3.82984466, 3.80464833,
        3.77945201, 3.75425569, 3.75425569, 3.72905937, 3.85504098,
        3.67866672, 3.45189952, 3.30072128, 3.09915070, 2.97316878,
        2.92277614, 2.79679452, 2.82199085, 2.84718717, 3.32591790,
        3.27552496, 3.27552496, 3.42670319, 3.30072128, 3.47709584,
        3.57788113, 5.06446567, 4.56053862, 4.20778980, 3.98102289,
        3.82984466, 3.85504098, 3.88023730, 3.90543362
    }; // covalent radii
    // Copy the per-element values for the types actually present (1-based).
    for (int i = 0; i < ntypes; i++) {
        r2r4[i+1] = r2r4_ref[atomic_numbers[i]-1];
        rcov[i+1] = rcov_ref[atomic_numbers[i]-1];
    }
    // set r0ab
    read_r0ab(atomic_numbers, ntypes);
    // read c6ab
    read_c6ab(atomic_numbers, ntypes);
}
/* ----------------------------------------------------------------------
Get derivative of C6 w.r.t. CN (used in PairD3::compute)
C6 = C6(CN_A, CN_B) == W(CN_A, CN_B) / Z(CN_A, CN_B)
This gives below from chain rule:
d(C6)/dr = d(C6)/d(CN_A) * d(CN_A)/dr + d(C6)/d(CN_B) * d(CN_B)/dr
So we can pre-calculate the d(C6)/d(CN_A), d(C6)/d(CN_B) part.
d(C6)/d(CN_i) = (dW/d(CN_i) * Z - W * dZ/d(CN_i)) / (W * W)
W : "denominator"
Z : "numerator"
dW/d(CN_i) : "d_denominator_i"
dZ/d(CN_j) : "d_numerator_j"
Z = Sum( L_ij(CN_A, CN_B) * C6_ref(CN_A_i, CN_B_j) ) over i, j
W = Sum( L_ij(CN_A, CN_B) ) over i, j
And the resulting derivative term is saved into
"dc6_iji_tot", "dc6_ijj_tot" array,
where we can find the value of d(C6)/d(CN_i)
by knowing the index of "iat", and "jat". ("idx_linij")
Also, c6 values will also be saved into "c6_ij_tot" array.
Here, as we are only interested in *pairs* of atoms, assume "iat" >= "jat".
Then "idx_linij" = "jat + (iat + 1) * iat / 2" have the order below.
idx_linij | j = 0 j = 1 j = 2 j = 3 ...
---------------------------------------------
i = 0 | 0
i = 1 | 1 2
i = 2 | 3 4 5
i = 3 | 6 7 8 9
... | ... ... ... ... ...
------------------------------------------------------------------------- */
__global__ void kernel_get_dC6_dCNij(
    int maxij, float K3,
    double *cn, int *mxc, float *****c6ab, int *type,
    float *c6_ij_tot, float *dc6_iji_tot, float *dc6_ijj_tot
) {
    // One thread per unique atom pair (iat >= jat), addressed by the linear
    // pair index "iter" described in the comment block above.
    int iter = blockIdx.x * blockDim.x + threadIdx.x;
    if (iter < maxij) {
        int iat, jat;
        ij_at_linij(iter, iat, jat);
        const int atomtype_i = type[iat];
        const int atomtype_j = type[jat];
        const float cni = cn[iat];
        const int mxci = mxc[atomtype_i];
        const float cnj = cn[jat];
        const int mxcj = mxc[atomtype_j];
        // Hoist the first two levels of the 5-level reference-table lookup.
        float ***c6ab_ij = c6ab[atomtype_i][atomtype_j];
        // Reference C6 whose (CN_A, CN_B) lies closest to the actual
        // coordination numbers; used as fallback if all weights underflow.
        // Sentinel is -FLT_MAX written as a literal: the original "-1e99f"
        // exceeds the float range (ill-formed; typically collapses to -inf).
        float c6mem = -3.402823466e+38f;
        float r_save = 9999.0f;
        // Gaussian-weighted interpolation accumulators. Kept in double since
        // the exp() weights span many orders of magnitude.
        double numerator = 0.0;        // Z        = sum L_ab * C6_ref
        double denominator = 0.0;      // W        = sum L_ab
        double d_numerator_i = 0.0;    // dZ/d(CN_i)
        double d_denominator_i = 0.0;  // dW/d(CN_i)
        double d_numerator_j = 0.0;    // dZ/d(CN_j)
        double d_denominator_j = 0.0;  // dW/d(CN_j)
        for (int a = 0; a < mxci; a++) {
            for (int b = 0; b < mxcj; b++) {
                float c6ref = c6ab_ij[a][b][0];
                if (c6ref > 0.0f) {
                    float cn_refi = c6ab_ij[a][b][1];
                    float cn_refj = c6ab_ij[a][b][2];
                    // Squared distance in (CN_i, CN_j) space to this reference point.
                    float r = (cn_refi - cni) * (cn_refi - cni) + (cn_refj - cnj) * (cn_refj - cnj);
                    if (r < r_save) {
                        r_save = r;
                        c6mem = c6ref;
                    }
                    double expterm = exp(static_cast<double>(K3) * static_cast<double>(r)); // must be double
                    numerator += c6ref * expterm;
                    denominator += expterm;
                    // d/d(CN) of exp(K3*r): chain-rule factor 2*K3*(CN - CN_ref).
                    expterm *= 2.0f * K3;
                    double term = expterm * (cni - cn_refi);
                    d_numerator_i += c6ref * term;
                    d_denominator_i += term;
                    term = expterm * (cnj - cn_refj);
                    d_numerator_j += c6ref * term;
                    d_denominator_j += term;
                }
            }
        }
        if (denominator > 1e-99) {
            // C6 = Z / W;  dC6/d(CN_i) = (dZ_i - C6 * dW_i) / W  (same for j).
            const double denominator_rc = 1.0 / denominator; // must be double
            const double unit_frac = numerator * denominator_rc;
            c6_ij_tot[iter] = unit_frac;
            dc6_iji_tot[iter] = denominator_rc * fma(unit_frac, -d_denominator_i, d_numerator_i); // must be double
            dc6_ijj_tot[iter] = denominator_rc * fma(unit_frac, -d_denominator_j, d_numerator_j); // must be double
        }
        else {
            // All weights underflowed: fall back to the nearest reference C6
            // and report a zero derivative.
            c6_ij_tot[iter] = c6mem;
            dc6_iji_tot[iter] = 0.0f;
            dc6_ijj_tot[iter] = 0.0f;
        }
    }
}
void PairD3::get_dC6_dCNij() {
    // Launch one thread per unique (i, j) atom pair, i >= j.
    // NOTE(review): atom->natoms is narrowed to int here; assumes the atom
    // count (and n*(n+1)/2) fits in a signed 32-bit int -- confirm for large systems.
    const int natoms = atom->natoms;
    const int num_pairs = natoms * (natoms + 1) / 2;
    //START_CUDA_TIMER();
    const int block_size = 128;
    const int grid_size = (num_pairs + block_size - 1) / block_size;
    kernel_get_dC6_dCNij<<<grid_size, block_size>>>(
        num_pairs, K3,
        cn, mxc, c6ab, atomtype,
        c6_ij_tot, dc6_iji_tot, dc6_ijj_tot
    );
    // Block until the kernel finishes: the outputs live in managed memory
    // and are consumed on the host right after this call.
    cudaDeviceSynchronize();
    //STOP_CUDA_TIMER("get_dC6dCNij");
}
/* ----------------------------------------------------------------------
Get lattice vectors (used in PairD3::compute)
1) Save lattice vectors into "lat_v_1", "lat_v_2", "lat_v_3"
2) Calculate repetition criteria for vdw, cn
3) precalculate tau (xyz shift due to cell repetition)
------------------------------------------------------------------------- */
void PairD3::set_lattice_vectors() {
// Box bounds and tilt factors in Angstrom. LAMMPS triclinic boxes are
// upper triangular: a1 along x, a2 in the xy plane.
double boxxlo = domain->boxlo[0];
double boxxhi = domain->boxhi[0];
double boxylo = domain->boxlo[1];
double boxyhi = domain->boxhi[1];
double boxzlo = domain->boxlo[2];
double boxzhi = domain->boxhi[2];
double xy = domain->xy;
double xz = domain->xz;
double yz = domain->yz;
// Lattice vectors converted to atomic units (Bohr).
lat_v_1[0] = (boxxhi - boxxlo) / AU_TO_ANG;
lat_v_1[1] = 0.0;
lat_v_1[2] = 0.0;
lat_v_2[0] = xy / AU_TO_ANG;
lat_v_2[1] = (boxyhi - boxylo) / AU_TO_ANG;
lat_v_2[2] = 0.0;
lat_v_3[0] = xz / AU_TO_ANG;
lat_v_3[1] = yz / AU_TO_ANG;
lat_v_3[2] = (boxzhi - boxzlo) / AU_TO_ANG;
// Extents (2*rep+1 cells per axis) of the OLD repetition grids, recorded
// before rep_vdw / rep_cn are updated: needed to free the old tau arrays.
int vdwrx_save = 2 * rep_vdw[0] + 1;
int vdwry_save = 2 * rep_vdw[1] + 1;
int vdwrz_save = 2 * rep_vdw[2] + 1;
int cnrx_save = 2 * rep_cn[0] + 1;
int cnry_save = 2 * rep_cn[1] + 1;
int cnrz_save = 2 * rep_cn[2] + 1;
// Recompute how many periodic images the vdW (rthr) and coordination-number
// (cnthr) cutoffs require for the current cell.
set_lattice_repetition_criteria(rthr, rep_vdw);
set_lattice_repetition_criteria(cnthr, rep_cn);
int vdw_range_x = 2 * rep_vdw[0] + 1;
int vdw_range_y = 2 * rep_vdw[1] + 1;
int vdw_range_z = 2 * rep_vdw[2] + 1;
int tau_loop_size_vdw = vdw_range_x * vdw_range_y * vdw_range_z * 3;
// Reallocate tau_vdw / tau_idx_vdw only when the flattened grid size changed.
// NOTE(review): two grids with different per-axis shapes but the same total
// size would skip reallocation and reuse the old shape -- confirm this
// cannot happen, or compare the per-axis extents instead.
if (tau_loop_size_vdw != tau_idx_vdw_total_size) {
if (tau_idx_vdw != nullptr) {
// Free the old 4-level tau_vdw array using the OLD extents.
for (int i = 0; i < vdwrx_save; i++) {
for (int j = 0; j < vdwry_save; j++) {
for (int k = 0; k < vdwrz_save; k++) {
cudaFree(tau_vdw[i][j][k]);
}
cudaFree(tau_vdw[i][j]);
}
cudaFree(tau_vdw[i]);
}
cudaFree(tau_vdw);
cudaFree(tau_idx_vdw);
}
tau_idx_vdw_total_size = tau_loop_size_vdw;
// Unified-memory allocation: filled on the host in precalculate_tau_array()
// and read by the force kernels on the device.
cudaMallocManaged(&tau_vdw, vdw_range_x * sizeof(float***));
for (int i = 0; i < vdw_range_x; i++) {
cudaMallocManaged(&tau_vdw[i], vdw_range_y * sizeof(float**));
for (int j = 0; j < vdw_range_y; j++) {
cudaMallocManaged(&tau_vdw[i][j], vdw_range_z * sizeof(float*));
for (int k = 0; k < vdw_range_z; k++) {
cudaMallocManaged(&tau_vdw[i][j][k], 3 * sizeof(float));
}
}
}
cudaMallocManaged(&tau_idx_vdw, tau_idx_vdw_total_size * sizeof(int));
}
// Same bookkeeping for the coordination-number repetition grid.
int cn_range_x = 2 * rep_cn[0] + 1;
int cn_range_y = 2 * rep_cn[1] + 1;
int cn_range_z = 2 * rep_cn[2] + 1;
int tau_loop_size_cn = cn_range_x * cn_range_y * cn_range_z * 3;
if (tau_loop_size_cn != tau_idx_cn_total_size) {
if (tau_idx_cn != nullptr) {
// Free the old 4-level tau_cn array using the OLD extents.
for (int i = 0; i < cnrx_save; i++) {
for (int j = 0; j < cnry_save; j++) {
for (int k = 0; k < cnrz_save; k++) {
cudaFree(tau_cn[i][j][k]);
}
cudaFree(tau_cn[i][j]);
}
cudaFree(tau_cn[i]);
}
cudaFree(tau_cn);
cudaFree(tau_idx_cn);
}
tau_idx_cn_total_size = tau_loop_size_cn;
cudaMallocManaged(&tau_cn, cn_range_x * sizeof(float***));
for (int i = 0; i < cn_range_x; i++) {
cudaMallocManaged(&tau_cn[i], cn_range_y * sizeof(float**));
for (int j = 0; j < cn_range_y; j++) {
cudaMallocManaged(&tau_cn[i][j], cn_range_z * sizeof(float*));
for (int k = 0; k < cn_range_z; k++) {
cudaMallocManaged(&tau_cn[i][j][k], 3 * sizeof(float));
}
}
}
cudaMallocManaged(&tau_idx_cn, tau_idx_cn_total_size * sizeof(int));
}
}
/* ----------------------------------------------------------------------
Set repetition criteria (used in PairD3::compute)
Needed as Periodic Boundary Condition should be considered.
As the cell may *not* be orthorhombic,
the dot product should be used between x/y/z direction and
corresponding cross product vector.
------------------------------------------------------------------------- */
void PairD3::set_lattice_repetition_criteria(float r_threshold, int* rep_v) {
    // How many periodic images per lattice direction are required so that a
    // sphere of radius sqrt(r_threshold) is covered (r_threshold is a squared
    // cutoff). For a triclinic cell the relevant extent along a_i is the cell
    // height: dot(a_i, n) / |n| with n normal to the plane of the other two
    // lattice vectors.
    const double cutoff = sqrt(r_threshold);
    double normal_23[3], normal_31[3], normal_12[3];
    cross3(lat_v_2, lat_v_3, normal_23);  // normal to the (a2, a3) plane
    cross3(lat_v_3, lat_v_1, normal_31);  // normal to the (a3, a1) plane
    cross3(lat_v_1, lat_v_2, normal_12);  // normal to the (a1, a2) plane
    double height;
    height = dot3(normal_23, lat_v_1) / len3(normal_23);
    rep_v[0] = static_cast<int>(std::abs(cutoff / height)) + 1;
    height = dot3(normal_31, lat_v_2) / len3(normal_31);
    rep_v[1] = static_cast<int>(std::abs(cutoff / height)) + 1;
    height = dot3(normal_12, lat_v_3) / len3(normal_12);
    rep_v[2] = static_cast<int>(std::abs(cutoff / height)) + 1;
    // Non-periodic directions need no images.
    if (!domain->xperiodic) { rep_v[0] = 0; }
    if (!domain->yperiodic) { rep_v[1] = 0; }
    if (!domain->zperiodic) { rep_v[2] = 0; }
}
/* ----------------------------------------------------------------------
Calculate Coordination Number (used in PairD3::compute)
------------------------------------------------------------------------- */
__global__ void kernel_get_coordination_number(
int maxij, int maxtau, float cnthr, float K1,
float *rcov, int *rep_cn, float ****tau_cn, int *tau_idx_cn, int *type, float **x,
double *cn
) {
// One thread per unique atom pair (iat >= jat), linear pair index "iter".
int iter = blockIdx.x * blockDim.x + threadIdx.x;
if (iter < maxij) {
int iat, jat;
ij_at_linij(iter, iat, jat);
float cn_local = 0.0f;
if (iat == jat) {
// Self pair: the atom against its own periodic images only.
const float rcov_sum = rcov[type[iat]] * 2.0f;
// tau_idx_cn stores (ix, iy, iz) index triplets back-to-back.
for (int k = maxtau - 1; k >= 0; k -= 3) {
const int idx1 = tau_idx_cn[k-2];
const int idx2 = tau_idx_cn[k-1];
const int idx3 = tau_idx_cn[k];
// (rep, rep, rep) indexes the unshifted (0,0,0) image: the atom itself.
if (idx1 == rep_cn[0] && idx2 == rep_cn[1] && idx3 == rep_cn[2]) { continue; }
const float rx = tau_cn[idx1][idx2][idx3][0];
const float ry = tau_cn[idx1][idx2][idx3][1];
const float rz = tau_cn[idx1][idx2][idx3][2];
const float r2 = rx * rx + ry * ry + rz * rz;
// cnthr is a squared cutoff radius.
if (r2 <= cnthr) {
const float r_rc = rsqrtf(r2);
// Counting function 1 / (1 + exp(-K1 * (rcov_ij / r - 1))).
const float damp = 1.0f / (1.0f + expf(-K1 * ((rcov_sum * r_rc) - 1.0f)));
cn_local += damp;
}
}
atomicAdd(&cn[iat], cn_local);
}
else {
// Distinct pair: identical contribution accumulated onto both atoms.
const float rcov_sum = rcov[type[iat]] + rcov[type[jat]];
for (int k = maxtau - 1; k >= 0; k -= 3) {
const int idx1 = tau_idx_cn[k-2];
const int idx2 = tau_idx_cn[k-1];
const int idx3 = tau_idx_cn[k];
const float rx = x[jat][0] - x[iat][0] + tau_cn[idx1][idx2][idx3][0];
const float ry = x[jat][1] - x[iat][1] + tau_cn[idx1][idx2][idx3][1];
const float rz = x[jat][2] - x[iat][2] + tau_cn[idx1][idx2][idx3][2];
const float r2 = rx * rx + ry * ry + rz * rz;
if (r2 <= cnthr) {
const float r_rc = rsqrtf(r2);
const float damp = 1.0f / (1.0f + expf(-K1 * ((rcov_sum * r_rc) - 1.0f)));
cn_local += damp;
}
}
atomicAdd(&cn[iat], cn_local);
atomicAdd(&cn[jat], cn_local);
}
}
}
void PairD3::get_coordination_number() {
    const int natoms = atom->natoms;
    const int num_pairs = natoms * (natoms + 1) / 2;
    const int num_tau = tau_idx_cn_total_size;
    // The kernel accumulates atomically into cn, so clear it first.
    for (int i = 0; i < natoms; i++) { cn[i] = 0.0; }
    //START_CUDA_TIMER();
    const int block_size = 128;
    const int grid_size = (num_pairs + block_size - 1) / block_size;
    kernel_get_coordination_number<<<grid_size, block_size>>>(
        num_pairs, num_tau, cnthr, K1,
        rcov, rep_cn, tau_cn, tau_idx_cn, atomtype, x,
        cn
    );
    // cn lives in managed memory and is read on the host afterwards.
    cudaDeviceSynchronize();
    //STOP_CUDA_TIMER("get_coord");
}
/* ----------------------------------------------------------------------
reallocate memory if the number of atoms has changed (used in PairD3::compute)
------------------------------------------------------------------------- */
void PairD3::reallocate_arrays() {
    // Release the per-atom and per-pair buffers sized for the previous atom
    // count (n_save). cudaFree(nullptr) is a no-op, so freeing before the
    // first allocation is safe as long as the pointers start as null.
    for (int i = 0; i < n_save; i++) { cudaFree(x[i]); }
    cudaFree(x);
    for (int i = 0; i < n_save; i++) { cudaFree(f[i]); }
    cudaFree(f);
    cudaFree(cn);
    cudaFree(dc6i);
    cudaFree(dc6_iji_tot);
    cudaFree(dc6_ijj_tot);
    cudaFree(c6_ij_tot);
    cudaFree(atomtype);
    // Allocate for the current atom count. Unified memory: written on the
    // host, read/accumulated by the kernels.
    const int n = atom->natoms;
    n_save = n;
    const int n_pairs = n * (n + 1) / 2;  // number of unique (i, j) pairs, i >= j
    cudaMallocManaged(&cn, n * sizeof(double));
    cudaMallocManaged(&dc6i, n * sizeof(double));
    cudaMallocManaged(&atomtype, n * sizeof(int));
    cudaMallocManaged(&x, n * sizeof(float*));
    for (int i = 0; i < n; i++) { cudaMallocManaged(&x[i], 3 * sizeof(float)); }
    cudaMallocManaged(&f, n * sizeof(double*));
    for (int i = 0; i < n; i++) { cudaMallocManaged(&f[i], 3 * sizeof(double)); }
    cudaMallocManaged(&dc6_iji_tot, n_pairs * sizeof(float));
    cudaMallocManaged(&dc6_ijj_tot, n_pairs * sizeof(float));
    cudaMallocManaged(&c6_ij_tot, n_pairs * sizeof(float));
}
void PairD3::reallocate_arrays_np1() {
    // Release the per-type tables sized for the previous ntypes+1 (np1_save).
    cudaFree(r2r4);
    cudaFree(rcov);
    cudaFree(mxc);
    for (int ti = 0; ti < np1_save; ti++) { cudaFree(r0ab[ti]); }
    cudaFree(r0ab);
    // c6ab is a 5-level jagged array: [type_i][type_j][MAXC][MAXC][3].
    for (int ti = 0; ti < np1_save; ti++) {
        for (int tj = 0; tj < np1_save; tj++) {
            for (int ci = 0; ci < MAXC; ci++) {
                for (int cj = 0; cj < MAXC; cj++) {
                    cudaFree(c6ab[ti][tj][ci][cj]);
                }
                cudaFree(c6ab[ti][tj][ci]);
            }
            cudaFree(c6ab[ti][tj]);
        }
        cudaFree(c6ab[ti]);
    }
    cudaFree(c6ab);
    // Allocate for the current number of atom types (unified memory; type
    // indices are 1-based in LAMMPS, hence ntypes + 1 slots).
    const int np1 = atom->ntypes + 1;
    np1_save = np1;
    cudaMallocManaged(&r2r4, np1 * sizeof(float));
    cudaMallocManaged(&rcov, np1 * sizeof(float));
    cudaMallocManaged(&mxc, np1 * sizeof(int));
    cudaMallocManaged(&r0ab, np1 * sizeof(float*));
    for (int ti = 0; ti < np1; ti++) { cudaMallocManaged(&r0ab[ti], np1 * sizeof(float)); }
    cudaMallocManaged(&c6ab, np1 * sizeof(float****));
    for (int ti = 0; ti < np1; ti++) {
        cudaMallocManaged(&c6ab[ti], np1 * sizeof(float***));
        for (int tj = 0; tj < np1; tj++) {
            cudaMallocManaged(&c6ab[ti][tj], MAXC * sizeof(float**));
            for (int ci = 0; ci < MAXC; ci++) {
                cudaMallocManaged(&c6ab[ti][tj][ci], MAXC * sizeof(float*));
                for (int cj = 0; cj < MAXC; cj++) {
                    cudaMallocManaged(&c6ab[ti][tj][ci][cj], 3 * sizeof(float));
                }
            }
        }
    }
}
/* ----------------------------------------------------------------------
Initialize atomic positions & types (used in PairD3::compute)
As the default xyz from lammps does not assure that atoms are within unit cell,
this function shifts atoms into the unit cell.
------------------------------------------------------------------------- */
void PairD3::load_atom_info() {
// Lattice matrix with the lattice vectors as COLUMNS, so lat * frac = cart.
double lat[3][3];
lat[0][0] = lat_v_1[0];
lat[0][1] = lat_v_2[0];
lat[0][2] = lat_v_3[0];
lat[1][0] = lat_v_1[1];
lat[1][1] = lat_v_2[1];
lat[1][2] = lat_v_3[1];
lat[2][0] = lat_v_1[2];
lat[2][1] = lat_v_2[2];
lat[2][2] = lat_v_3[2];
// Determinant for the explicit 3x3 inverse below (no singularity check:
// a valid simulation box is assumed non-degenerate).
double det = lat[0][0] * lat[1][1] * lat[2][2]
+ lat[0][1] * lat[1][2] * lat[2][0]
+ lat[0][2] * lat[1][0] * lat[2][1]
- lat[0][2] * lat[1][1] * lat[2][0]
- lat[0][1] * lat[1][0] * lat[2][2]
- lat[0][0] * lat[1][2] * lat[2][1];
// Inverse via the adjugate / determinant: maps cartesian -> fractional.
double lat_inv[3][3];
lat_inv[0][0] = (lat[1][1] * lat[2][2] - lat[1][2] * lat[2][1]) / det;
lat_inv[1][0] = (lat[1][2] * lat[2][0] - lat[1][0] * lat[2][2]) / det;
lat_inv[2][0] = (lat[1][0] * lat[2][1] - lat[1][1] * lat[2][0]) / det;
lat_inv[0][1] = (lat[0][2] * lat[2][1] - lat[0][1] * lat[2][2]) / det;
lat_inv[1][1] = (lat[0][0] * lat[2][2] - lat[0][2] * lat[2][0]) / det;
lat_inv[2][1] = (lat[0][1] * lat[2][0] - lat[0][0] * lat[2][1]) / det;
lat_inv[0][2] = (lat[0][1] * lat[1][2] - lat[0][2] * lat[1][1]) / det;
lat_inv[1][2] = (lat[0][2] * lat[1][0] - lat[0][0] * lat[1][2]) / det;
lat_inv[2][2] = (lat[0][0] * lat[1][1] - lat[0][1] * lat[1][0]) / det;
double a[3] = { 0.0 };
for (int iat = 0; iat < atom->natoms; iat++) {
for (int i = 0; i < 3; i++) {
// Fractional coordinate of atom iat; positions converted Angstrom -> Bohr.
a[i] = lat_inv[i][0] * (atom->x)[iat][0] / AU_TO_ANG +
lat_inv[i][1] * (atom->x)[iat][1] / AU_TO_ANG +
lat_inv[i][2] * (atom->x)[iat][2] / AU_TO_ANG;
// Wrap into [0, 1) in one step.
a[i] -= floor(a[i]); // replaces the code below
//if (a[i] > 1) { while (a[i] > 1) { a[i]--; } }
//else if (a[i] < 0) { while (a[i] < 0) { a[i]++; } }
}
// Back to cartesian (Bohr), stored in the float working array x used by
// the GPU kernels.
for (int i = 0; i < 3; i++) {
x[iat][i] = (lat[i][0] * a[0] + lat[i][1] * a[1] + lat[i][2] * a[2]);
}
}
}
/* ----------------------------------------------------------------------
Precalculate tau array
------------------------------------------------------------------------- */
void PairD3::precalculate_tau_array() {
    // Fill a tau table for one repetition grid:
    //   tau[i][j][k] = (i - xlim)*a1 + (j - ylim)*a2 + (k - zlim)*a3
    // and append the (i, j, k) triplets consecutively to tau_idx.
    auto fill_tau = [&](const int* rep, float**** tau, int* tau_idx) {
        const int xlim = rep[0];
        const int ylim = rep[1];
        const int zlim = rep[2];
        int index = 0;
        for (int tx = -xlim; tx <= xlim; tx++) {
            for (int ty = -ylim; ty <= ylim; ty++) {
                for (int tz = -zlim; tz <= zlim; tz++) {
                    float* shift = tau[tx + xlim][ty + ylim][tz + zlim];
                    shift[0] = lat_v_1[0] * tx + lat_v_2[0] * ty + lat_v_3[0] * tz;
                    shift[1] = lat_v_1[1] * tx + lat_v_2[1] * ty + lat_v_3[1] * tz;
                    shift[2] = lat_v_1[2] * tx + lat_v_2[2] * ty + lat_v_3[2] * tz;
                    tau_idx[index++] = tx + xlim;
                    tau_idx[index++] = ty + ylim;
                    tau_idx[index++] = tz + zlim;
                }
            }
        }
    };
    // Same procedure for the vdW and the coordination-number grids.
    fill_tau(rep_vdw, tau_vdw, tau_idx_vdw);
    fill_tau(rep_cn, tau_cn, tau_idx_cn);
}
/* ----------------------------------------------------------------------
Get forces (Zero damping)
------------------------------------------------------------------------- */
// One thread per unique atom pair (iat >= jat). Forces and dC6 terms are
// accumulated with per-atom atomics; the virial (sigma) and dispersion energy
// (disp) are first reduced inside the block in shared memory so that only one
// global atomicAdd per block is issued for each of them.
__global__ void kernel_get_forces_without_dC6_zero(
int maxij, int maxtau, float rthr, float s6, float s8, float a1, float a2, float alp6, float alp8,
float *r2r4, float **r0ab, int *rep_vdw, float ****tau_vdw, int *tau_idx_vdw, int *type, float **x,
float *c6_ij_tot, float *dc6_iji_tot, float *dc6_ijj_tot,
double *dc6i, double *disp, double **f, double **sigma
) {
int iter = blockIdx.x * blockDim.x + threadIdx.x;
// Shared buffers sized for exactly 128 threads: the launcher uses
// threadsPerBlock = 128 and the tree reduction assumes a power of two.
__shared__ float sigma_00[128];
__shared__ float sigma_01[128];
__shared__ float sigma_02[128];
__shared__ float sigma_10[128];
__shared__ float sigma_11[128];
__shared__ float sigma_12[128];
__shared__ float sigma_20[128];
__shared__ float sigma_21[128];
__shared__ float sigma_22[128];
__shared__ float disp_shared[128];
// Per-thread partials; threads with iter >= maxij contribute zeros to the
// block reduction below.
float sigma_local_00 = 0.0f;
float sigma_local_01 = 0.0f;
float sigma_local_02 = 0.0f;
float sigma_local_10 = 0.0f;
float sigma_local_11 = 0.0f;
float sigma_local_12 = 0.0f;
float sigma_local_20 = 0.0f;
float sigma_local_21 = 0.0f;
float sigma_local_22 = 0.0f;
float disp_local = 0.0f;
if (iter < maxij) {
int iat, jat;
ij_at_linij(iter, iat, jat);
float f_local[3] = { 0.0f };
float dc6i_local_i = 0.0f;
float dc6i_local_j = 0.0f;
// Pair-interpolated C6 and its CN derivatives, precomputed by
// kernel_get_dC6_dCNij for the same linear pair index.
const float c6 = c6_ij_tot[iter];
const float dc6iji = dc6_iji_tot[iter];
const float dc6ijj = dc6_ijj_tot[iter];
if (iat == jat) {
// Self pair: atom iat against its own periodic images. Only energy,
// virial and dC6 terms are accumulated (no per-atom force term here).
const int atomtype_i = type[iat];
const float r0 = r0ab[atomtype_i][atomtype_i];
const float unit_r2r4 = r2r4[atomtype_i];
const float r42 = unit_r2r4 * unit_r2r4;
const float unit_a1 = (a1 * r0);
const float unit_a2 = (a2 * r0);
const float s8r42 = s8 * r42;
// tau_idx_vdw stores (ix, iy, iz) triplets back-to-back.
for (int k = maxtau - 1; k >= 0; k -= 3) {
const int idx1 = tau_idx_vdw[k-2];
const int idx2 = tau_idx_vdw[k-1];
const int idx3 = tau_idx_vdw[k];
// (rep, rep, rep) is the unshifted (0,0,0) image: the atom itself.
if (idx1 == rep_vdw[0] && idx2 == rep_vdw[1] && idx3 == rep_vdw[2]) { continue; }
const float rij[3] = {
tau_vdw[idx1][idx2][idx3][0],
tau_vdw[idx1][idx2][idx3][1],
tau_vdw[idx1][idx2][idx3][2]
};
const float r2 = lensq3(rij);
// rthr is a squared cutoff.
if (r2 > rthr) { continue; }
const float r_rc = rsqrtf(r2);
// t6 = (a1*r0/r)^14 built by repeated squaring/multiplication.
float unit_rc_a1 = unit_a1 * r_rc;
float t6 = unit_rc_a1 * unit_rc_a1; // ^2
t6 *= unit_rc_a1; // ^3
t6 *= t6; // ^6
t6 *= unit_rc_a1; // ^7
t6 *= t6; // ^14
const float damp6 = 1.0f / fmaf(t6, 6.0f, 1.0f);
// t8 = (a2*r0/r)^16.
float unit_rc_a2 = unit_a2 * r_rc;
float t8 = unit_rc_a2 * unit_rc_a2; // ^2
t8 *= t8; // ^4
t8 *= t8; // ^8
t8 *= t8; // ^16
const float damp8 = 1.0f / fmaf(t8, 6.0f, 1.0f);
const float r2_rc = r_rc * r_rc; // 1.0 / r2
const float r6_rc = r2_rc * r2_rc * r2_rc;
const float r8_rc = r6_rc * r2_rc;
// 3.0f = 6.0f * 0.5: self-pair contributions carry a factor 1/2
// (see the commented-out expanded form below).
const float x1 = 3.0f * c6 * r8_rc * fmaf(r2_rc, s8r42 * damp8 * fmaf(3.0f * alp8 * t8, damp8, -4.0f), s6 * damp6 * fmaf(alp6 * t6, damp6, -1.0f));
//const float x1 = 0.5 * 6.0 * c6 * r8_rc * (s6 * damp6 * (14.0 * t6 * damp6 - 1.0) + s8r42 * r2_rc * damp8 * (48.0 * t8 * damp8 - 4.0));
//3.0 * alp6 = 48.0
const float vec[3] = {
x1 * rij[0],
x1 * rij[1],
x1 * rij[2]
};
// Virial outer product vec (x) rij.
sigma_local_00 += vec[0] * rij[0];
sigma_local_01 += vec[0] * rij[1];
sigma_local_02 += vec[0] * rij[2];
sigma_local_10 += vec[1] * rij[0];
sigma_local_11 += vec[1] * rij[1];
sigma_local_12 += vec[1] * rij[2];
sigma_local_20 += vec[2] * rij[0];
sigma_local_21 += vec[2] * rij[1];
sigma_local_22 += vec[2] * rij[2];
// Pair energy prefactor (with the 0.5 self-pair factor).
const float dc6_rest = 0.5f * r6_rc * fmaf(3.0f * r2_rc, s8r42 * damp8, s6 * damp6);
//const float dc6_rest = 0.5 * r6_rc * (s6 * damp6 + 3.0 * s8r42 * damp8 * r2_rc);
disp_local -= dc6_rest * c6;
dc6i_local_i += dc6_rest * dc6iji;
dc6i_local_j += dc6_rest * dc6ijj;
}
// dc6i is double; atomicAdd(double*, ...) requires SM60+.
atomicAdd(&dc6i[iat], dc6i_local_i);
atomicAdd(&dc6i[jat], dc6i_local_j);
}
else {
// Distinct pair: same math, full weight 6.0f, Newton's-third-law force.
const int atomtype_i = type[iat];
const int atomtype_j = type[jat];
const float r0 = r0ab[atomtype_i][atomtype_j];
const float r42 = r2r4[atomtype_i] * r2r4[atomtype_j];
const float unit_a1 = (a1 * r0);
const float unit_a2 = (a2 * r0);
const float s8r42 = s8 * r42;
for (int k = maxtau - 1; k >= 0; k -= 3) {
const int idx1 = tau_idx_vdw[k-2];
const int idx2 = tau_idx_vdw[k-1];
const int idx3 = tau_idx_vdw[k];
const float rij[3] = {
x[jat][0] - x[iat][0] + tau_vdw[idx1][idx2][idx3][0],
x[jat][1] - x[iat][1] + tau_vdw[idx1][idx2][idx3][1],
x[jat][2] - x[iat][2] + tau_vdw[idx1][idx2][idx3][2]
};
const float r2 = lensq3(rij);
if (r2 > rthr) { continue; }
const float r_rc = rsqrtf(r2);
float unit_rc_a1 = unit_a1 * r_rc;
float t6 = unit_rc_a1 * unit_rc_a1; // ^2
t6 *= unit_rc_a1; // ^3
t6 *= t6; // ^6
t6 *= unit_rc_a1; // ^7
t6 *= t6; // ^14
const float damp6 = 1.0f / fmaf(t6, 6.0f, 1.0f);
float unit_rc_a2 = unit_a2 * r_rc;
float t8 = unit_rc_a2 * unit_rc_a2; // ^2
t8 *= t8; // ^4
t8 *= t8; // ^8
t8 *= t8; // ^16
const float damp8 = 1.0f / fmaf(t8, 6.0f, 1.0f);
const float r2_rc = r_rc * r_rc; // 1.0 / r2
const float r6_rc = r2_rc * r2_rc * r2_rc;
const float r8_rc = r6_rc * r2_rc;
const float x1 = 6.0f * c6 * r8_rc * fmaf(r2_rc, s8r42 * damp8 * fmaf(3.0f * alp8 * t8, damp8, -4.0f), s6 * damp6 * fmaf(alp6 * t6, damp6, -1.0f));
//const float x1 = 6.0 * c6 * r8_rc * (s6 * damp6 * (14.0 * t6 * damp6 - 1.0) + s8r42 * r2_rc * damp8 * (48.0 * t8 * damp8 - 4.0));
//3.0 * alp6 = 48.0
const float vec[3] = {
x1 * rij[0],
x1 * rij[1],
x1 * rij[2]
};
f_local[0] -= vec[0];
f_local[1] -= vec[1];
f_local[2] -= vec[2];
sigma_local_00 += vec[0] * rij[0];
sigma_local_01 += vec[0] * rij[1];
sigma_local_02 += vec[0] * rij[2];
sigma_local_10 += vec[1] * rij[0];
sigma_local_11 += vec[1] * rij[1];
sigma_local_12 += vec[1] * rij[2];
sigma_local_20 += vec[2] * rij[0];
sigma_local_21 += vec[2] * rij[1];
sigma_local_22 += vec[2] * rij[2];
const float dc6_rest = r6_rc * fmaf(3.0f * r2_rc, s8r42 * damp8, s6 * damp6);
//const float dc6_rest = r6_rc * (s6 * damp6 + 3.0 * s8r42 * damp8 * r2_rc);
disp_local -= dc6_rest * c6;
dc6i_local_i += dc6_rest * dc6iji;
dc6i_local_j += dc6_rest * dc6ijj;
}
atomicAdd(&dc6i[iat], dc6i_local_i);
atomicAdd(&dc6i[jat], dc6i_local_j);
// Equal and opposite forces on the two atoms (f is double; see SM60+ note).
atomicAdd(&f[iat][0], f_local[0]);
atomicAdd(&f[iat][1], f_local[1]);
atomicAdd(&f[iat][2], f_local[2]);
atomicAdd(&f[jat][0], -f_local[0]);
atomicAdd(&f[jat][1], -f_local[1]);
atomicAdd(&f[jat][2], -f_local[2]);
}
}
// Block-wide tree reduction of virial and energy partials. Every thread
// (including out-of-range ones) reaches the barriers below.
sigma_00[threadIdx.x] = sigma_local_00;
sigma_01[threadIdx.x] = sigma_local_01;
sigma_02[threadIdx.x] = sigma_local_02;
sigma_10[threadIdx.x] = sigma_local_10;
sigma_11[threadIdx.x] = sigma_local_11;
sigma_12[threadIdx.x] = sigma_local_12;
sigma_20[threadIdx.x] = sigma_local_20;
sigma_21[threadIdx.x] = sigma_local_21;
sigma_22[threadIdx.x] = sigma_local_22;
disp_shared[threadIdx.x] = disp_local;
__syncthreads();
for (int s=blockDim.x/2; s>0; s>>=1) {
if (threadIdx.x < s) {
sigma_00[threadIdx.x] += sigma_00[threadIdx.x + s];
sigma_01[threadIdx.x] += sigma_01[threadIdx.x + s];
sigma_02[threadIdx.x] += sigma_02[threadIdx.x + s];
sigma_10[threadIdx.x] += sigma_10[threadIdx.x + s];
sigma_11[threadIdx.x] += sigma_11[threadIdx.x + s];
sigma_12[threadIdx.x] += sigma_12[threadIdx.x + s];
sigma_20[threadIdx.x] += sigma_20[threadIdx.x + s];
sigma_21[threadIdx.x] += sigma_21[threadIdx.x + s];
sigma_22[threadIdx.x] += sigma_22[threadIdx.x + s];
disp_shared[threadIdx.x] += disp_shared[threadIdx.x + s];
}
__syncthreads();
}
// One global atomic per block for each reduced quantity.
if (threadIdx.x == 0) {
atomicAdd(&sigma[0][0], sigma_00[0]);
atomicAdd(&sigma[0][1], sigma_01[0]);
atomicAdd(&sigma[0][2], sigma_02[0]);
atomicAdd(&sigma[1][0], sigma_10[0]);
atomicAdd(&sigma[1][1], sigma_11[0]);
atomicAdd(&sigma[1][2], sigma_12[0]);
atomicAdd(&sigma[2][0], sigma_20[0]);
atomicAdd(&sigma[2][1], sigma_21[0]);
atomicAdd(&sigma[2][2], sigma_22[0]);
atomicAdd(disp, disp_shared[0]);
}
}
void PairD3::get_forces_without_dC6_zero() {
    const int natoms = atom->natoms;
    const int num_pairs = natoms * (natoms + 1) / 2;
    const int num_tau = tau_idx_vdw_total_size;
    // The kernel accumulates atomically into all of these; clear them first.
    *disp = 0.0;
    for (int i = 0; i < natoms; i++) { dc6i[i] = 0.0; }
    for (int i = 0; i < natoms; i++) {
        f[i][0] = 0.0;
        f[i][1] = 0.0;
        f[i][2] = 0.0;
    }
    for (int row = 0; row < 3; row++) {
        for (int col = 0; col < 3; col++) { sigma[row][col] = 0.0; }
    }
    //START_CUDA_TIMER();
    const int block_size = 128;
    const int grid_size = (num_pairs + block_size - 1) / block_size;
    kernel_get_forces_without_dC6_zero<<<grid_size, block_size>>>(
        num_pairs, num_tau, rthr, s6, s8, a1, a2, alp6, alp8,
        r2r4, r0ab, rep_vdw, tau_vdw, tau_idx_vdw, atomtype, x,
        c6_ij_tot, dc6_iji_tot, dc6_ijj_tot,
        dc6i, disp, f, sigma
    );
    cudaDeviceSynchronize();
    // disp lives in managed memory; safe to read only after the sync above.
    disp_total = *disp;
    //STOP_CUDA_TIMER("get_forces_without");
}
// Becke-Johnson damping variant: one thread per unique atom pair (iat >= jat).
// Forces and dC6 terms use per-atom atomics; virial (sigma) and dispersion
// energy (disp) are block-reduced in shared memory before one global atomicAdd
// per block.
__global__ void kernel_get_forces_without_dC6_bj(
int maxij, int maxtau, float rthr, float s6, float s8, float a1, float a2,
float *r2r4, int *rep_vdw, float ****tau_vdw, int *tau_idx_vdw, int *type, float **x,
float *c6_ij_tot, float *dc6_iji_tot, float *dc6_ijj_tot,
double *dc6i, double *disp, double **f, double **sigma
) {
int iter = blockIdx.x * blockDim.x + threadIdx.x;
// Shared buffers sized for exactly 128 threads: launcher uses
// threadsPerBlock = 128 and the tree reduction assumes a power of two.
__shared__ float sigma_00[128];
__shared__ float sigma_01[128];
__shared__ float sigma_02[128];
__shared__ float sigma_10[128];
__shared__ float sigma_11[128];
__shared__ float sigma_12[128];
__shared__ float sigma_20[128];
__shared__ float sigma_21[128];
__shared__ float sigma_22[128];
__shared__ float disp_shared[128];
// Per-thread partials; out-of-range threads contribute zeros below.
float sigma_local_00 = 0.0f;
float sigma_local_01 = 0.0f;
float sigma_local_02 = 0.0f;
float sigma_local_10 = 0.0f;
float sigma_local_11 = 0.0f;
float sigma_local_12 = 0.0f;
float sigma_local_20 = 0.0f;
float sigma_local_21 = 0.0f;
float sigma_local_22 = 0.0f;
float disp_local = 0.0f;
if (iter < maxij) {
int iat, jat;
ij_at_linij(iter, iat, jat);
float f_local[3] = { 0.0f };
float dc6i_local_i = 0.0f;
float dc6i_local_j = 0.0f;
// Pair-interpolated C6 and CN derivatives from kernel_get_dC6_dCNij.
const float c6 = c6_ij_tot[iter];
const float dc6iji = dc6_iji_tot[iter];
const float dc6ijj = dc6_ijj_tot[iter];
if (iat == jat) {
// Self pair: atom against its own periodic images; no net force term,
// only energy / virial / dC6 with a factor 1/2.
const float unit_r2r4 = r2r4[type[iat]];
const float r42x3 = unit_r2r4 * unit_r2r4 * 3.0f;
// BJ damping radius R0 = a1 * sqrt(3 * r2r4_i * r2r4_j) + a2.
const float R0 = fmaf(a1, sqrtf(r42x3), a2);
const float R0_2 = R0 * R0;
const float R0_6 = R0_2 * R0_2 * R0_2;
const float R0_8 = R0_6 * R0_2;
const float s8r42x3 = s8 * r42x3;
// tau_idx_vdw stores (ix, iy, iz) triplets back-to-back.
for (int k = maxtau - 1; k >= 0; k -= 3) {
const int idx1 = tau_idx_vdw[k-2];
const int idx2 = tau_idx_vdw[k-1];
const int idx3 = tau_idx_vdw[k];
// (rep, rep, rep) is the unshifted (0,0,0) image: the atom itself.
if (idx1 == rep_vdw[0] && idx2 == rep_vdw[1] && idx3 == rep_vdw[2]) { continue; }
const float rij[3] = {
tau_vdw[idx1][idx2][idx3][0],
tau_vdw[idx1][idx2][idx3][1],
tau_vdw[idx1][idx2][idx3][2]
};
const float r2 = lensq3(rij);
// rthr is a squared cutoff.
if (r2 > rthr) { continue; }
const float r = sqrtf(r2);
const float r5 = r2 * r2 * r;
const float r7 = r5 * r2;
// t6_rc = 1/(r^6 + R0^6), t8_rc = 1/(r^8 + R0^8): BJ denominators.
const float t6_rc = 1.0f / fmaf(r5, r, R0_6);
const float t8_rc = 1.0f / fmaf(r7, r, R0_8);
const float t6_sqrc = t6_rc * t6_rc;
const float t8_sqrc = t8_rc * t8_rc;
// 3.0f/4.0f = (6.0f/8.0f) * 0.5: self-pair factor 1/2 (see the
// commented-out expanded form below).
const float x1 = -c6 * fmaf(4.0f * s8r42x3 * r7, t8_sqrc, 3.0f * s6 * r5 * t6_sqrc);
//const float x1 = 0.5 * -c6 * (6.0 * s6 * r5 * t6_sqrc + 8.0 * s8r42x3 * r7 * t8_sqrc;
const float r_rc = 1.0f / r; // rsqrt(r2)
const float vec[3] = {
x1 * rij[0] * r_rc,
x1 * rij[1] * r_rc,
x1 * rij[2] * r_rc
};
// Virial outer product vec (x) rij.
sigma_local_00 += vec[0] * rij[0];
sigma_local_01 += vec[0] * rij[1];
sigma_local_02 += vec[0] * rij[2];
sigma_local_10 += vec[1] * rij[0];
sigma_local_11 += vec[1] * rij[1];
sigma_local_12 += vec[1] * rij[2];
sigma_local_20 += vec[2] * rij[0];
sigma_local_21 += vec[2] * rij[1];
sigma_local_22 += vec[2] * rij[2];
const float dc6_rest = 0.5f * fmaf(s8r42x3, t8_rc, s6 * t6_rc);
//const float dc6_rest = 0.5 * s6 * t6_rc + s8r42x3 * t8_rc;
disp_local -= dc6_rest * c6;
dc6i_local_i += dc6_rest * dc6iji;
dc6i_local_j += dc6_rest * dc6ijj;
}
// dc6i is double; atomicAdd(double*, ...) requires SM60+.
atomicAdd(&dc6i[iat], dc6i_local_i);
atomicAdd(&dc6i[jat], dc6i_local_j);
}
else {
// Distinct pair: full weight (6.0/8.0 factors), Newton's-third-law force.
const float r42x3 = r2r4[type[iat]] * r2r4[type[jat]] * 3.0f;
const float R0 = fmaf(a1, sqrtf(r42x3), a2);
const float R0_2 = R0 * R0;
const float R0_6 = R0_2 * R0_2 * R0_2;
const float R0_8 = R0_6 * R0_2;
const float s8r42x3 = s8 * r42x3;
for (int k = maxtau - 1; k >= 0; k -= 3) {
const int idx1 = tau_idx_vdw[k-2];
const int idx2 = tau_idx_vdw[k-1];
const int idx3 = tau_idx_vdw[k];
const float rij[3] = {
x[jat][0] - x[iat][0] + tau_vdw[idx1][idx2][idx3][0],
x[jat][1] - x[iat][1] + tau_vdw[idx1][idx2][idx3][1],
x[jat][2] - x[iat][2] + tau_vdw[idx1][idx2][idx3][2]
};
const float r2 = lensq3(rij);
if (r2 > rthr) { continue; }
const float r = sqrtf(r2);
const float r5 = r2 * r2 * r;
const float r7 = r5 * r2;
const float t6_rc = 1.0f / fmaf(r5, r, R0_6);
const float t8_rc = 1.0f / fmaf(r7, r, R0_8);
const float t6_sqrc = t6_rc * t6_rc;
const float t8_sqrc = t8_rc * t8_rc;
const float x1 = -c6 * fmaf(8.0f * s8r42x3 * r7, t8_sqrc, 6.0f * s6 * r5 * t6_sqrc);
//const float x1 = -c6 * (6.0 * s6 * r5 * t6_sqrc + 8.0 * s8r42x3 * r7 * t8_sqrc;
const float r_rc = 1.0f / r; // rsqrt(r2)
const float vec[3] = {
x1 * rij[0] * r_rc,
x1 * rij[1] * r_rc,
x1 * rij[2] * r_rc
};
f_local[0] -= vec[0];
f_local[1] -= vec[1];
f_local[2] -= vec[2];
sigma_local_00 += vec[0] * rij[0];
sigma_local_01 += vec[0] * rij[1];
sigma_local_02 += vec[0] * rij[2];
sigma_local_10 += vec[1] * rij[0];
sigma_local_11 += vec[1] * rij[1];
sigma_local_12 += vec[1] * rij[2];
sigma_local_20 += vec[2] * rij[0];
sigma_local_21 += vec[2] * rij[1];
sigma_local_22 += vec[2] * rij[2];
const float dc6_rest = fmaf(s8r42x3, t8_rc, s6 * t6_rc);
//const float dc6_rest = s6 * t6_rc + s8r42x3 * t8_rc;
disp_local -= dc6_rest * c6;
dc6i_local_i += dc6_rest * dc6iji;
dc6i_local_j += dc6_rest * dc6ijj;
}
atomicAdd(&dc6i[iat], dc6i_local_i);
atomicAdd(&dc6i[jat], dc6i_local_j);
// Equal and opposite forces (f is double; SM60+ for double atomicAdd).
atomicAdd(&f[iat][0], f_local[0]);
atomicAdd(&f[iat][1], f_local[1]);
atomicAdd(&f[iat][2], f_local[2]);
atomicAdd(&f[jat][0], -f_local[0]);
atomicAdd(&f[jat][1], -f_local[1]);
atomicAdd(&f[jat][2], -f_local[2]);
}
}
// Block-wide tree reduction; all threads (including out-of-range ones)
// reach both barriers.
sigma_00[threadIdx.x] = sigma_local_00;
sigma_01[threadIdx.x] = sigma_local_01;
sigma_02[threadIdx.x] = sigma_local_02;
sigma_10[threadIdx.x] = sigma_local_10;
sigma_11[threadIdx.x] = sigma_local_11;
sigma_12[threadIdx.x] = sigma_local_12;
sigma_20[threadIdx.x] = sigma_local_20;
sigma_21[threadIdx.x] = sigma_local_21;
sigma_22[threadIdx.x] = sigma_local_22;
disp_shared[threadIdx.x] = disp_local;
__syncthreads();
for (int s=blockDim.x/2; s>0; s>>=1) {
if (threadIdx.x < s) {
sigma_00[threadIdx.x] += sigma_00[threadIdx.x + s];
sigma_01[threadIdx.x] += sigma_01[threadIdx.x + s];
sigma_02[threadIdx.x] += sigma_02[threadIdx.x + s];
sigma_10[threadIdx.x] += sigma_10[threadIdx.x + s];
sigma_11[threadIdx.x] += sigma_11[threadIdx.x + s];
sigma_12[threadIdx.x] += sigma_12[threadIdx.x + s];
sigma_20[threadIdx.x] += sigma_20[threadIdx.x + s];
sigma_21[threadIdx.x] += sigma_21[threadIdx.x + s];
sigma_22[threadIdx.x] += sigma_22[threadIdx.x + s];
disp_shared[threadIdx.x] += disp_shared[threadIdx.x + s];
}
__syncthreads();
}
// One global atomic per block for each reduced quantity.
if (threadIdx.x == 0) {
atomicAdd(&sigma[0][0], sigma_00[0]);
atomicAdd(&sigma[0][1], sigma_01[0]);
atomicAdd(&sigma[0][2], sigma_02[0]);
atomicAdd(&sigma[1][0], sigma_10[0]);
atomicAdd(&sigma[1][1], sigma_11[0]);
atomicAdd(&sigma[1][2], sigma_12[0]);
atomicAdd(&sigma[2][0], sigma_20[0]);
atomicAdd(&sigma[2][1], sigma_21[0]);
atomicAdd(&sigma[2][2], sigma_22[0]);
atomicAdd(disp, disp_shared[0]);
}
}
/* Zero the accumulators and launch the Becke-Johnson damping dispersion
 * kernel over all unique atom pairs; collects energy, forces, virial and
 * the per-atom dE/dC6 sums (dc6i) in one pass. */
void PairD3::get_forces_without_dC6_bj() {
    const int natoms = atom->natoms;
    const int pair_count = natoms * (natoms + 1) / 2;  // upper-triangular ij pairs
    const int tau_count = tau_idx_vdw_total_size;

    // Reset everything the kernel accumulates into with atomics.
    *disp = 0.0;
    for (int i = 0; i < natoms; ++i) {
        dc6i[i] = 0.0;
        f[i][0] = 0.0;
        f[i][1] = 0.0;
        f[i][2] = 0.0;
    }
    for (int r = 0; r < 3; ++r) {
        sigma[r][0] = 0.0;
        sigma[r][1] = 0.0;
        sigma[r][2] = 0.0;
    }

    //START_CUDA_TIMER();
    // One thread per ij pair; the kernel's shared arrays assume blockDim.x == 128.
    const int block_size = 128;
    const int grid_size = (pair_count + block_size - 1) / block_size;
    kernel_get_forces_without_dC6_bj<<<grid_size, block_size>>>(
        pair_count, tau_count, rthr, s6, s8, a1, a2,
        r2r4, rep_vdw, tau_vdw, tau_idx_vdw, atomtype, x,
        c6_ij_tot, dc6_iji_tot, dc6_ijj_tot,
        dc6i, disp, f, sigma
    );
    cudaDeviceSynchronize();
    disp_total = *disp;
    //STOP_CUDA_TIMER("get_forces_without");
}
// NOTE(review): modified zero-damping variant is not implemented; selecting
// it silently computes no dispersion forces — confirm this is intentional.
void PairD3::get_forces_without_dC6_zerom() {}
// NOTE(review): modified BJ-damping variant is not implemented; selecting
// it silently computes no dispersion forces — confirm this is intentional.
void PairD3::get_forces_without_dC6_bjm() {}
/* Dispatch to the force routine matching the selected damping scheme.
 * 'damping' uses the same ordering as the original dispatch table:
 * 0 zero, 1 bj, 2 zerom, 3 bjm. */
void PairD3::get_forces_without_dC6() {
    switch (damping) {
    case 0: get_forces_without_dC6_zero();  break;
    case 1: get_forces_without_dC6_bj();    break;
    case 2: get_forces_without_dC6_zerom(); break;
    case 3: get_forces_without_dC6_bjm();   break;
    }
}
/* CN-gradient force kernel (the "with dC6" pass): adds the chain-rule term
 * dE/dCN(atom) * dCN/dr to the forces and virial, where the per-atom
 * dE/dCN sums were accumulated into dc6i[] by the previous pass.
 *
 * Launch contract:
 *   - one thread per unique atom pair; 'iter' indexes the upper triangle and
 *     is decoded by ij_at_linij(), guarded by iter < maxij;
 *   - blockDim.x must be exactly 128 (fixed-size shared arrays below) and a
 *     power of two (tree reduction);
 *   - f and sigma are accumulated with atomics and must be pre-zeroed.
 */
__global__ void kernel_get_forces_with_dC6(
    int maxij, int maxtau, float cnthr, float K1,
    double *dc6i, float *rcov, int *rep_cn, float ****tau_cn, int *tau_idx_cn, int *type, float **x,
    double **f, double **sigma
) {
    int iter = blockIdx.x * blockDim.x + threadIdx.x;
    // One reduction slot per thread for each virial component.
    __shared__ float sigma_00[128];
    __shared__ float sigma_01[128];
    __shared__ float sigma_02[128];
    __shared__ float sigma_10[128];
    __shared__ float sigma_11[128];
    __shared__ float sigma_12[128];
    __shared__ float sigma_20[128];
    __shared__ float sigma_21[128];
    __shared__ float sigma_22[128];
    float sigma_local_00 = 0.0f;
    float sigma_local_01 = 0.0f;
    float sigma_local_02 = 0.0f;
    float sigma_local_10 = 0.0f;
    float sigma_local_11 = 0.0f;
    float sigma_local_12 = 0.0f;
    float sigma_local_20 = 0.0f;
    float sigma_local_21 = 0.0f;
    float sigma_local_22 = 0.0f;
    float f_local[3] = { 0.0f };
    if (iter < maxij) {
        int iat, jat;
        ij_at_linij(iter, iat, jat);
        if (iat == jat) {
            // Self pair: only periodic images of the atom contribute. The net
            // force from an atom's own images cancels by symmetry, so only
            // the virial is accumulated here (f_local stays zero).
            const float rcov_sum = rcov[type[iat]] * 2.0f;
            const float dc6i_sum = dc6i[iat];
            // tau_idx_cn holds flat index triplets; walk them back to front.
            for (int k = maxtau - 1; k >= 0; k -= 3) {
                const int idx1 = tau_idx_cn[k-2];
                const int idx2 = tau_idx_cn[k-1];
                const int idx3 = tau_idx_cn[k];
                // Skip the untranslated cell (rep_cn holds its index triplet):
                // that would be the atom at zero distance from itself.
                if (idx1 == rep_cn[0] && idx2 == rep_cn[1] && idx3 == rep_cn[2]) { continue; }
                const float rij[3] = {
                    tau_cn[idx1][idx2][idx3][0],
                    tau_cn[idx1][idx2][idx3][1],
                    tau_cn[idx1][idx2][idx3][2],
                };
                const float r2 = lensq3(rij);
                // NOTE(review): CN cutoff is tested with '>=' here while the
                // vdW kernel uses '> rthr' — confirm the boundary convention.
                if (r2 >= cnthr) { continue; }
                const float r_rc = rsqrtf(r2);  // 1/r
                // Derivative pieces of the CN counting function
                // 1 / (1 + exp(-K1 * (rcov/r - 1))).
                const float expterm = expf(-K1 * (rcov_sum * r_rc - 1.0f));
                const float unit_rc = 1.0f / (r2 * (expterm + 1.0f) * (expterm + 1.0f));
                const float dcnn = -K1 * rcov_sum * expterm * unit_rc;
                const float x1 = dcnn * dc6i_sum;
                // Direction: rij scaled by 1/r (r_rc), magnitude x1.
                const float vec[3] = {
                    x1 * rij[0] * r_rc,
                    x1 * rij[1] * r_rc,
                    x1 * rij[2] * r_rc
                };
                sigma_local_00 += vec[0] * rij[0];
                sigma_local_01 += vec[0] * rij[1];
                sigma_local_02 += vec[0] * rij[2];
                sigma_local_10 += vec[1] * rij[0];
                sigma_local_11 += vec[1] * rij[1];
                sigma_local_12 += vec[1] * rij[2];
                sigma_local_20 += vec[2] * rij[0];
                sigma_local_21 += vec[2] * rij[1];
                sigma_local_22 += vec[2] * rij[2];
            }
        }
        else {
            // Distinct pair: loop over all translations of jat relative to iat.
            const float rcov_sum = rcov[type[iat]] + rcov[type[jat]];
            const float dc6i_sum = dc6i[iat] + dc6i[jat];
            for (int k = maxtau - 1; k >= 0; k -= 3) {
                const int idx1 = tau_idx_cn[k-2];
                const int idx2 = tau_idx_cn[k-1];
                const int idx3 = tau_idx_cn[k];
                const float rij[3] = {
                    x[jat][0] - x[iat][0] + tau_cn[idx1][idx2][idx3][0],
                    x[jat][1] - x[iat][1] + tau_cn[idx1][idx2][idx3][1],
                    x[jat][2] - x[iat][2] + tau_cn[idx1][idx2][idx3][2]
                };
                const float r2 = lensq3(rij);
                if (r2 >= cnthr) { continue; }
                const float r_rc = rsqrtf(r2);  // 1/r
                const float expterm = expf(-K1 * (rcov_sum * r_rc - 1.0f));
                const float unit_rc = 1.0f / (r2 * (expterm + 1.0f) * (expterm + 1.0f));
                const float dcnn = -K1 * rcov_sum * expterm * unit_rc;
                const float x1 = dcnn * dc6i_sum;
                const float vec[3] = {
                    x1 * rij[0] * r_rc,
                    x1 * rij[1] * r_rc,
                    x1 * rij[2] * r_rc
                };
                // Newton's third law: equal and opposite forces on iat / jat.
                f_local[0] -= vec[0];
                f_local[1] -= vec[1];
                f_local[2] -= vec[2];
                sigma_local_00 += vec[0] * rij[0];
                sigma_local_01 += vec[0] * rij[1];
                sigma_local_02 += vec[0] * rij[2];
                sigma_local_10 += vec[1] * rij[0];
                sigma_local_11 += vec[1] * rij[1];
                sigma_local_12 += vec[1] * rij[2];
                sigma_local_20 += vec[2] * rij[0];
                sigma_local_21 += vec[2] * rij[1];
                sigma_local_22 += vec[2] * rij[2];
            }
            atomicAdd(&f[iat][0], f_local[0]);
            atomicAdd(&f[iat][1], f_local[1]);
            atomicAdd(&f[iat][2], f_local[2]);
            atomicAdd(&f[jat][0], -f_local[0]);
            atomicAdd(&f[jat][1], -f_local[1]);
            atomicAdd(&f[jat][2], -f_local[2]);
        }
    }
    // All threads (including iter >= maxij, which carry zeros) join the
    // block-wide virial reduction; barriers below are outside divergent code.
    sigma_00[threadIdx.x] = sigma_local_00;
    sigma_01[threadIdx.x] = sigma_local_01;
    sigma_02[threadIdx.x] = sigma_local_02;
    sigma_10[threadIdx.x] = sigma_local_10;
    sigma_11[threadIdx.x] = sigma_local_11;
    sigma_12[threadIdx.x] = sigma_local_12;
    sigma_20[threadIdx.x] = sigma_local_20;
    sigma_21[threadIdx.x] = sigma_local_21;
    sigma_22[threadIdx.x] = sigma_local_22;
    __syncthreads();
    // Shared-memory tree reduction; requires blockDim.x to be a power of two.
    for (int s=blockDim.x/2; s>0; s>>=1) {
        if (threadIdx.x < s) {
            sigma_00[threadIdx.x] += sigma_00[threadIdx.x + s];
            sigma_01[threadIdx.x] += sigma_01[threadIdx.x + s];
            sigma_02[threadIdx.x] += sigma_02[threadIdx.x + s];
            sigma_10[threadIdx.x] += sigma_10[threadIdx.x + s];
            sigma_11[threadIdx.x] += sigma_11[threadIdx.x + s];
            sigma_12[threadIdx.x] += sigma_12[threadIdx.x + s];
            sigma_20[threadIdx.x] += sigma_20[threadIdx.x + s];
            sigma_21[threadIdx.x] += sigma_21[threadIdx.x + s];
            sigma_22[threadIdx.x] += sigma_22[threadIdx.x + s];
        }
        __syncthreads();
    }
    // One atomic per block and component into the global virial.
    if (threadIdx.x == 0) {
        atomicAdd(&sigma[0][0], sigma_00[0]);
        atomicAdd(&sigma[0][1], sigma_01[0]);
        atomicAdd(&sigma[0][2], sigma_02[0]);
        atomicAdd(&sigma[1][0], sigma_10[0]);
        atomicAdd(&sigma[1][1], sigma_11[0]);
        atomicAdd(&sigma[1][2], sigma_12[0]);
        atomicAdd(&sigma[2][0], sigma_20[0]);
        atomicAdd(&sigma[2][1], sigma_21[0]);
        atomicAdd(&sigma[2][2], sigma_22[0]);
    }
}
/* Launch the coordination-number chain-rule kernel; it adds the
 * dE/dCN * dCN/dr contribution on top of the forces/virial already
 * accumulated by get_forces_without_dC6(). */
void PairD3::get_forces_with_dC6() {
    const int natoms = atom->natoms;
    const int pair_count = natoms * (natoms + 1) / 2;  // upper-triangular ij pairs
    const int tau_count = tau_idx_cn_total_size;

    //START_CUDA_TIMER();
    const int block_size = 128;  // kernel's shared-memory arrays assume this
    const int grid_size = (pair_count + block_size - 1) / block_size;
    kernel_get_forces_with_dC6<<<grid_size, block_size>>>(
        pair_count, tau_count, cnthr, K1,
        dc6i, rcov, rep_cn, tau_cn, tau_idx_cn, atomtype, x,
        f, sigma
    );
    cudaDeviceSynchronize();
    //STOP_CUDA_TIMER("get_forces_with");
}
/* ----------------------------------------------------------------------
Update energy, force, and stress
------------------------------------------------------------------------- */
/* Convert the accumulated atomic-unit results into the output buffers read
 * by the C wrappers: energy in eV, forces in eV/Angstrom (flattened), and
 * the virial in xx, yy, zz, xy, xz, yz order (eV). */
void PairD3::update() {
    const int natoms = atom->natoms;

    // Energy: Hartree -> eV.
    result_E = disp_total * AU_TO_EV;

    // Forces: Hartree/Bohr -> eV/Angstrom, flattened row-major for the wrapper.
    for (int idx = 0; idx < natoms * 3; ++idx) {
        result_F[idx] = f[idx / 3][idx % 3] * AU_TO_EV / AU_TO_ANG;
    }

    // Virial components, converted to eV.
    result_S[0] = sigma[0][0] * AU_TO_EV;  // xx
    result_S[1] = sigma[1][1] * AU_TO_EV;  // yy
    result_S[2] = sigma[2][2] * AU_TO_EV;  // zz
    result_S[3] = sigma[0][1] * AU_TO_EV;  // xy
    result_S[4] = sigma[0][2] * AU_TO_EV;  // xz
    result_S[5] = sigma[1][2] * AU_TO_EV;  // yz
}
/* ----------------------------------------------------------------------
Compute : energy, force, and stress (Required)
------------------------------------------------------------------------- */
// Full D3 evaluation for the current geometry. Order matters: coordination
// numbers feed the C6 interpolation, whose outputs feed both force passes;
// update() finally converts the atomic-unit accumulators to eV-based buffers.
void PairD3::compute() {
    if (atom->natoms != n_save) { reallocate_arrays(); }  // atom count changed
    set_lattice_vectors();
    precalculate_tau_array();
    load_atom_info();
    // Push per-atom types to the device; kernels index 'atomtype' directly.
    cudaMemcpy(atomtype, atom->type, atom->natoms * sizeof(int), cudaMemcpyHostToDevice);
    get_coordination_number();
    get_dC6_dCNij();
    get_forces_without_dC6();  // pairwise dispersion with fixed C6
    get_forces_with_dC6();     // dE/dCN chain-rule contribution
    update();                  // unit conversion into result_E / result_F / result_S
    CHECK_CUDA_ERROR();
}
// Placeholder entry point: the code is consumed as a shared library through
// the extern "C" wrappers below; main() exists only so the file links as a
// standalone executable.
int main() {
}
extern "C" { // C wrapper for ctypes or cffi
    // Create a PairD3 instance; release it with pair_fin().
    PairD3* pair_init() {
        return new PairD3();
    }
    // Attach atom data. 'type' and 'x_flat' stay owned by the caller and must
    // outlive the pair object; only the row-pointer table is allocated here.
    void pair_set_atom(PairD3* pair, int natoms, int ntypes, int* type, double* x_flat) {
        // Release any previous configuration so repeated calls do not leak.
        if (pair->atom) {
            delete[] pair->atom->x;  // row-pointer table from a previous call
            delete pair->atom;
            pair->atom = nullptr;
        }
        delete[] pair->result_F;
        pair->result_F = nullptr;
        // Build row pointers into the caller's flat (natoms x 3) buffer.
        double** x = new double*[natoms];
        for (int i = 0; i < natoms; i++) {
            x[i] = x_flat + i * 3;
        }
        pair->atom = new Atom(natoms, ntypes, type, x);
        pair->result_F = new double[natoms * 3];
    }
    // Define the simulation cell: periodicity flags, bounds, triclinic tilts.
    void pair_set_domain(PairD3* pair, int xperiodic, int yperiodic, int zperiodic, double* boxlo, double* boxhi, double xy, double xz, double yz) {
        delete pair->domain;  // avoid leaking an earlier domain on repeated calls
        pair->domain = new Domain(xperiodic, yperiodic, zperiodic, boxlo, boxhi, xy, xz, yz);
    }
    // Forward cutoffs and damping/functional names to PairD3::settings().
    void pair_run_settings(PairD3* pair, double rthr, double cnthr, const char* damp_name, const char* func_name) {
        pair->settings(rthr, cnthr, damp_name, func_name);
    }
    // Forward per-type atomic numbers to PairD3::coeff().
    void pair_run_coeff(PairD3* pair, int* atomic_numbers) {
        pair->coeff(atomic_numbers);
    }
    // Run one energy/force/stress evaluation.
    void pair_run_compute(PairD3* pair) {
        pair->compute();
    }
    // Dispersion energy in eV (valid after pair_run_compute()).
    double pair_get_energy(PairD3* pair) {
        return pair->result_E;
    }
    // Flattened forces (3 * natoms, eV/Angstrom); owned by the pair object.
    double* pair_get_force(PairD3* pair) {
        return pair->result_F;
    }
    // Virial (xx, yy, zz, xy, xz, yz, in eV); owned by the pair object.
    double* pair_get_stress(PairD3* pair) {
        return pair->result_S;
    }
    // Destroy the pair object. ~Pair() releases result_F, atom, domain and
    // error; the Atom row-pointer table is released here because Atom does
    // not own it (previously it leaked).
    void pair_fin(PairD3* pair) {
        if (pair && pair->atom) {
            delete[] pair->atom->x;
            pair->atom->x = nullptr;
        }
        delete pair;
    }
}
/*
This code is a skeleton of the LAMMPS pair_style d3 accelerated by CUDA.
All dependencies on LAMMPS have been removed.
The input and output variables are named based on the LAMMPS variables.
*/
#ifndef LMP_PAIR_D3
#define LMP_PAIR_D3
#include <cmath>
#include <string>
#include <vector> // for 'element_table'
#include <algorithm> // for 'atomic_number'
#include <map>
#include <unordered_map>
#include <cuda_runtime.h>
#include "pair_d3_pars.h"
// Removed dependencies to STL
// #include <stdlib.h> -> no more C style functions
// #define _USE_MATH_DEFINES -> no predefined constants
// Removed dependencies to LAMMPS
// #include "pair.h" -> removed, for construncting pair class.
// #include "utils.h" -> removed, some float converters.
// #include "atom.h" -> Atom class to replace it.
// #include "domain.h" -> Domain class to replace it.
// #include "error.h" -> Error class to replace it.
// #include "comm.h" -> already no dependency
// #include "neighbor.h" -> already no dependency
// #include "neigh_list.h" -> already no dependency
// #include "memory.h" -> already no dependency for CUDA version
// #include "math_extra.h" -> removed, dot and len3 operations.
// #include "potential_file_reader.h" -> removed, PotentialFileReader
/* --------- Fake class to replace 'LAMMPS' class --------- */
/* Minimal stand-in for LAMMPS's Atom class: only the fields PairD3 reads.
 * Non-owning: 'type' and 'x' are borrowed from the caller and are not freed
 * by the destructor. */
class Atom {
public:
    int natoms;   // number of atoms
    int ntypes;   // number of atom types
    int* type;    // per-atom type indices (borrowed)
    double** x;   // per-atom positions, x[i][0..2] (borrowed)
    Atom(int natoms_in, int ntypes_in, int* type_in, double** x_in) {
        natoms = natoms_in;
        ntypes = ntypes_in;
        type = type_in;
        x = x_in;
    }
    // Intentionally empty: ownership of 'type' and 'x' stays with the caller.
    ~Atom() {
    }
};
/* Minimal stand-in for LAMMPS's Domain: periodicity flags, box bounds and
 * triclinic tilt factors. Box bounds are copied, not borrowed. */
class Domain {
public:
    int xperiodic, yperiodic, zperiodic;  // 1 if periodic along the axis
    double boxlo[3], boxhi[3];            // lower/upper box corners (copied)
    double xy, xz, yz;                    // triclinic tilt factors
    Domain(int xperiodic, int yperiodic, int zperiodic, double* boxlo, double* boxhi, double xy, double xz, double yz)
        : xperiodic(xperiodic),
          yperiodic(yperiodic),
          zperiodic(zperiodic),
          xy(xy),
          xz(xz),
          yz(yz) {
        // Copy the bounds so the caller's arrays need not outlive this object.
        this->boxlo[0] = boxlo[0]; this->boxlo[1] = boxlo[1]; this->boxlo[2] = boxlo[2];
        this->boxhi[0] = boxhi[0]; this->boxhi[1] = boxhi[1]; this->boxhi[2] = boxhi[2];
    }
    ~Domain() {
    }
};
/* Minimal stand-in for LAMMPS's Error class. Unlike the real one it does
 * not terminate the program; it only reports and returns. */
class Error {
public:
    // Report an error message. 'flerr' mimics the LAMMPS FLERR (file/line)
    // argument and is unused in this fake implementation.
    void all(int flerr, const char* message) {
        (void)flerr;
        // Errors belong on stderr so they survive stdout redirection
        // (previously this went to stdout via printf).
        fprintf(stderr, "Error: %s\n", message);
    }
    Error() {}
    ~Error() {}
};
/* ------------------------------------------------------- */
/* --------- Declaration of fake classes and variables --------- */
#define FLERR 1
//Error* error = nullptr;
//
//int allocated;
//int** setflag;
//double** cutsq;
//Atom* atom = nullptr;
//Domain* domain = nullptr;
//
//double result_E;
//double* result_F = nullptr;
//double result_S[6];
class Pair {
public:
int allocated;
Atom* atom;
Domain* domain;
double result_E;
double* result_F;
double result_S[6];
Error* error;
Pair()
: allocated(0), atom(nullptr), domain(nullptr), result_E(0.0), result_F(nullptr), error(nullptr) {
std::fill(std::begin(result_S), std::end(result_S), 0.0);
}
virtual ~Pair() {
if (result_F) {
delete[] result_F;
result_F = nullptr;
}
if (atom) {
delete atom;
atom = nullptr;
}
if (domain) {
delete domain;
domain = nullptr;
}
if (error) {
delete error;
error = nullptr;
}
}
};
/* -------------------------------------------------------------- */
/* CUDA port of the DFT-D3 dispersion correction behind the reduced Pair
 * interface above. Per compute() call: lattice/tau setup -> coordination
 * numbers -> C6 interpolation -> two force passes -> unit conversion.
 * NOTE(review): several arrays (x, f, sigma, dc6i, disp, atomtype, tau_*)
 * are dereferenced both on the host and inside kernels, so they are
 * presumably CUDA managed allocations — confirm in the allocation routines
 * (not shown in this chunk). */
class PairD3 : public Pair {
public:
    PairD3();
    ~PairD3();
    // rthr/cnthr are squared distance cutoffs; damp_name/func_name select the
    // damping scheme and the XC-functional parameter set.
    void settings(double rthr, double cnthr, std::string damp_name, std::string func_name);
    void coeff(int* atomic_number);
    void compute();
protected:
    virtual void allocate();
    /* ------- Read parameters ------- */
    int find_atomic_number(std::string&);
    int is_int_in_array(int*, int, int);
    void read_r0ab(int*, int); // void read_r0ab(class LAMMPS*, char*, int*, int);
    void get_limit_in_pars_array(int&, int&, int&, int&);
    void read_c6ab(int*, int); // void read_c6ab(class LAMMPS*, char*, int*, int);
    // Damping-parameter setters, one per scheme; setfuncpar() dispatches.
    void setfuncpar_zero();
    void setfuncpar_bj();
    void setfuncpar_zerom();
    void setfuncpar_bjm();
    void setfuncpar();
    /* ------- Read parameters ------- */
    /* ------- Lattice information ------- */
    void set_lattice_repetition_criteria(float, int*);
    void set_lattice_vectors();
    /* ------- Lattice information ------- */
    /* ------- Initialize & Precalculate ------- */
    void load_atom_info();
    void precalculate_tau_array();
    /* ------- Initialize & Precalculate ------- */
    /* ------- Reallocate (when number of atoms changed) ------- */
    void reallocate_arrays();
    void reallocate_arrays_np1();
    /* ------- Reallocate (when number of atoms changed) ------- */
    /* ------- Coordination number ------- */
    void get_coordination_number();
    void get_dC6_dCNij();
    /* ------- Coordination number ------- */
    /* ------- Main workers ------- */
    // Force passes; _zerom and _bjm variants are currently empty stubs.
    void get_forces_without_dC6_zero();
    void get_forces_without_dC6_bj();
    void get_forces_without_dC6_zerom();
    void get_forces_without_dC6_bjm();
    void get_forces_without_dC6();
    void get_forces_with_dC6();
    void update();
    /* ------- Main workers ------- */
    /*--------- Constants ---------*/
    static constexpr int MAX_ELEM = 94;             // maximum of the element number
    static constexpr int MAXC = 5;                  // maximum coordination number references per element
    static constexpr double AU_TO_ANG = 0.52917726;  // conversion factors (atomic unit --> angstrom)
    static constexpr double AU_TO_EV = 27.21138505;  // conversion factors (atomic unit --> eV)
    static constexpr float K1 = 16.0;               // global ad hoc parameters
    static constexpr float K3 = -4.0;               // global ad hoc parameters
    /*--------- Constants ---------*/
    /*--------- Parameters to read ---------*/
    int damping;                 // damping scheme index: 0 zero, 1 bj, 2 zerom, 3 bjm
    std::string functional;      // XC functional name used to pick D3 parameters
    float* r2r4 = nullptr;       // scale r4/r2 values of the atoms by sqrt(Z)
    float* rcov = nullptr;       // covalent radii
    int* mxc = nullptr;          // How large the grid for c6 interpolation
    float** r0ab = nullptr;      // cut-off radii for all element pairs
    float***** c6ab = nullptr;   // C6 for all element pairs
    float rthr;                  // R^2 distance to cutoff for C calculation
    float cnthr;                 // R^2 distance to cutoff for CN_calculation
    float s6, s8, s18, rs6, rs8, rs18, alp, alp6, alp8, a1, a2; // parameters for D3
    /*--------- Parameters to read ---------*/
    /*--------- Lattice related values ---------*/
    double* lat_v_1 = nullptr;   // lattice coordination vector
    double* lat_v_2 = nullptr;   // lattice coordination vector
    double* lat_v_3 = nullptr;   // lattice coordination vector
    int* rep_vdw = nullptr;      // repetition of cell for calculating D3
    int* rep_cn = nullptr;       // repetition of cell for calculating
    double** sigma = nullptr;    // virial pressure on cell
    /*--------- Lattice related values ---------*/
    /*--------- Per-atom values/arrays ---------*/
    double* cn = nullptr;        // Coordination numbers
    float** x = nullptr;         // Positions
    double** f = nullptr;        // Forces
    double* dc6i = nullptr;      // dC6i(iat) saves dE_dsp/dCN(iat)
    /*--------- Per-atom values/arrays ---------*/
    /*--------- Per-pair values/arrays ---------*/
    // Per unique (i,j) pair: interpolated C6 and its CN derivatives.
    float* c6_ij_tot = nullptr;
    float* dc6_iji_tot = nullptr;
    float* dc6_ijj_tot = nullptr;
    /*--------- Per-pair values/arrays ---------*/
    /*---------- Global values ---------*/
    int n_save;                  // to check whether the number of atoms has changed
    int np1_save;                // to check whether the number of types has changed
    float disp_total;            // Dispersion energy
    /*---------- Global values ---------*/
    /*--------- For loop over tau (translation of cell) ---------*/
    float**** tau_vdw = nullptr;
    float**** tau_cn = nullptr;
    int* tau_idx_vdw = nullptr;  // flattened (idx1, idx2, idx3) triplets
    int* tau_idx_cn = nullptr;   // flattened (idx1, idx2, idx3) triplets
    int tau_idx_vdw_total_size;
    int tau_idx_cn_total_size;
    /*--------- For loop over tau (translation of cell) ---------*/
    /*--------- For cuda memory transfer (pointerized) ---------*/
    int *atomtype;               // per-atom types, copied to device in compute()
    double *disp;                // dispersion-energy accumulator shared with kernels
    /*--------- For cuda memory transfer (pointerized) ---------*/
};
#endif // LMP_PAIR_D3
This source diff could not be displayed because it is too large. You can view the blob instead.
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://lammps.sandia.gov/, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Yutack Park (SNU)
------------------------------------------------------------------------- */
#include <ATen/ops/from_blob.h>
#include <c10/core/Scalar.h>
#include <c10/core/TensorOptions.h>
#include <string>
#include <torch/script.h>
#include <torch/torch.h>
#include "atom.h"
#include "domain.h"
#include "error.h"
#include "force.h"
#include "memory.h"
#include "neigh_list.h"
#include "neigh_request.h"
#include "neighbor.h"
#include "pair_e3gnn.h"
using namespace LAMMPS_NS;
#define INTEGER_TYPE torch::TensorOptions().dtype(torch::kInt64)
#define FLOAT_TYPE torch::TensorOptions().dtype(torch::kFloat)
// Constructor: pick the torch device (GPU when available) and honor the
// SEVENN_PRINT_INFO environment variable for verbose output.
PairE3GNN::PairE3GNN(LAMMPS *lmp) : Pair(lmp) {
  if (std::getenv("SEVENN_PRINT_INFO") != nullptr) {
    print_info = true;
  }
  const bool has_cuda = torch::cuda::is_available();
  device = has_cuda ? torch::kCUDA : torch::kCPU;
  const std::string device_name = has_cuda ? "CUDA" : "CPU";
  if (lmp->logfile) {
    fprintf(lmp->logfile, "PairE3GNN using device : %s\n", device_name.c_str());
  }
}
// Destructor: free the per-type tables created in allocate().
// NOTE(review): allocate() creates setflag, cutsq and map, but 'elements' is
// never created in this chunk — confirm memory->destroy(elements) cannot be
// an invalid free when coeff()/allocate() ran without it being set.
PairE3GNN::~PairE3GNN() {
  if (allocated) {
    memory->destroy(setflag);
    memory->destroy(cutsq);
    memory->destroy(map);
    memory->destroy(elements);
  }
}
// Build the graph input (node types, positions, edges within the cutoff,
// PBC shifts) from the LAMMPS neighbor list, run the E3GNN TorchScript
// model, and scatter energy / forces / virial / per-atom energies back.
void PairE3GNN::compute(int eflag, int vflag) {
  // compute
  /*
  This compute function is ispired/modified from stress branch of pair-nequip
  https://github.com/mir-group/pair_nequip
  */
  if (eflag || vflag)
    ev_setup(eflag, vflag);
  else
    evflag = vflag_fdotr = 0;
  if (vflag_atom) {
    error->all(FLERR, "atomic stress is not supported\n");
  }
  int nlocal = list->inum; // same as nlocal
  int *ilist = list->ilist;
  tagint *tag = atom->tag;
  // Map atom tags to dense 1-based indices; needed because tags may be
  // non-consecutive (tag - 1 becomes the model's node index).
  std::unordered_map<int, int> tag_map;
  if (atom->tag_consecutive() == 0) {
    for (int ii = 0; ii < nlocal; ii++) {
      const int i = ilist[ii];
      int itag = tag[i];
      tag_map[itag] = ii+1;
      // printf("MODIFY setting %i => %i \n",itag, tag_map[itag] );
    }
  } else {
    //Ordered which mappling required
    for (int ii = 0; ii < nlocal; ii++) {
      const int itag = ilist[ii]+1;
      tag_map[itag] = ii+1;
      // printf("normal setting %i => %i \n",itag, tag_map[itag] );
    }
  }
  double **x = atom->x;
  double **f = atom->f;
  int *type = atom->type;
  long num_atoms[1] = {nlocal};
  // NOTE(review): tag2i, pbc_shift_tmp and edge_idx_* below are C-style VLAs
  // (a compiler extension) on the stack — large systems risk stack overflow;
  // consider heap allocation.
  int tag2i[nlocal];
  int *numneigh = list->numneigh;      // j loop cond
  int **firstneigh = list->firstneigh; // j list
  // Upper bound on edge count: exact neighbor-count sum on the first call,
  // the cached padded bound afterwards (updated at the end of this function).
  int bound;
  if (this->nedges_bound == -1) {
    bound = std::accumulate(numneigh, numneigh + nlocal, 0);
  } else {
    bound = this->nedges_bound;
  }
  const int nedges_upper_bound = bound;
  // Cell matrix from the (possibly triclinic) LAMMPS box.
  float cell[3][3];
  cell[0][0] = domain->boxhi[0] - domain->boxlo[0];
  cell[0][1] = 0.0;
  cell[0][2] = 0.0;
  cell[1][0] = domain->xy;
  cell[1][1] = domain->boxhi[1] - domain->boxlo[1];
  cell[1][2] = 0.0;
  cell[2][0] = domain->xz;
  cell[2][1] = domain->yz;
  cell[2][2] = domain->boxhi[2] - domain->boxlo[2];
  torch::Tensor inp_cell = torch::from_blob(cell, {3, 3}, FLOAT_TYPE);
  torch::Tensor inp_num_atoms = torch::from_blob(num_atoms, {1}, INTEGER_TYPE);
  torch::Tensor inp_node_type = torch::zeros({nlocal}, INTEGER_TYPE);
  torch::Tensor inp_pos = torch::zeros({nlocal, 3});
  torch::Tensor inp_cell_volume =
      torch::dot(inp_cell[0], torch::cross(inp_cell[1], inp_cell[2], 0));
  float pbc_shift_tmp[nedges_upper_bound][3];
  auto node_type = inp_node_type.accessor<long, 1>();
  auto pos = inp_pos.accessor<float, 2>();
  long edge_idx_src[nedges_upper_bound];
  long edge_idx_dst[nedges_upper_bound];
  int nedges = 0;
  // First pass: fill node types/positions and the tag->local-index table.
  for (int ii = 0; ii < nlocal; ii++) {
    const int i = ilist[ii];
    int itag = tag_map[tag[i]];
    tag2i[itag - 1] = i;
    const int itype = type[i];
    node_type[itag - 1] = map[itype];
    pos[itag - 1][0] = x[i][0];
    pos[itag - 1][1] = x[i][1];
    pos[itag - 1][2] = x[i][2];
  }
  // Second pass: collect edges within the cutoff and the raw position
  // difference between a ghost image and its owned atom (for PBC shifts).
  for (int ii = 0; ii < nlocal; ii++) {
    const int i = ilist[ii];
    int itag = tag_map[tag[i]];
    const int *jlist = firstneigh[i];
    const int jnum = numneigh[i];
    for (int jj = 0; jj < jnum; jj++) {
      int j = jlist[jj]; // atom over pbc is different atom
      int jtag = tag_map[tag[j]]; // atom over pbs is same atom (it starts from 1)
      j &= NEIGHMASK;
      const int jtype = type[j];
      const double delij[3] = {x[j][0] - x[i][0], x[j][1] - x[i][1],
                               x[j][2] - x[i][2]};
      // Squared distance compared against the squared cutoff (no sqrt).
      const double Rij =
          delij[0] * delij[0] + delij[1] * delij[1] + delij[2] * delij[2];
      if (Rij < cutoff_square) {
        edge_idx_src[nedges] = itag - 1;
        edge_idx_dst[nedges] = jtag - 1;
        pbc_shift_tmp[nedges][0] = x[j][0] - pos[jtag - 1][0];
        pbc_shift_tmp[nedges][1] = x[j][1] - pos[jtag - 1][1];
        pbc_shift_tmp[nedges][2] = x[j][2] - pos[jtag - 1][2];
        nedges++;
      }
    } // j loop end
  } // i loop end
  auto edge_idx_src_tensor =
      torch::from_blob(edge_idx_src, {nedges}, INTEGER_TYPE);
  auto edge_idx_dst_tensor =
      torch::from_blob(edge_idx_dst, {nedges}, INTEGER_TYPE);
  auto inp_edge_index =
      torch::stack({edge_idx_src_tensor, edge_idx_dst_tensor});
  // r' = r + {shift_tensor(integer vector of len 3)} @ cell_tensor
  // shift_tensor = (cell_tensor)^-1^T @ (r' - r)
  torch::Tensor cell_inv_tensor =
      inp_cell.inverse().transpose(0, 1).unsqueeze(0).to(device);
  torch::Tensor pbc_shift_tmp_tensor =
      torch::from_blob(pbc_shift_tmp, {nedges, 3}, FLOAT_TYPE)
          .view({nedges, 3, 1})
          .to(device);
  torch::Tensor inp_cell_shift =
      torch::bmm(cell_inv_tensor.expand({nedges, 3, 3}), pbc_shift_tmp_tensor)
          .view({nedges, 3});
  // Positions need gradients so the model can produce forces.
  inp_pos.set_requires_grad(true);
  c10::Dict<std::string, torch::Tensor> input_dict;
  input_dict.insert("x", inp_node_type.to(device));
  input_dict.insert("pos", inp_pos.to(device));
  input_dict.insert("edge_index", inp_edge_index.to(device));
  input_dict.insert("num_atoms", inp_num_atoms.to(device));
  input_dict.insert("cell_lattice_vectors", inp_cell.to(device));
  input_dict.insert("cell_volume", inp_cell_volume.to(device));
  input_dict.insert("pbc_shift", inp_cell_shift);
  std::vector<torch::IValue> input(1, input_dict);
  // Run the TorchScript model and pull the outputs back to the CPU.
  auto output = model.forward(input).toGenericDict();
  torch::Tensor total_energy_tensor =
      output.at("inferred_total_energy").toTensor().cpu();
  torch::Tensor force_tensor = output.at("inferred_force").toTensor().cpu();
  auto forces = force_tensor.accessor<float, 2>();
  eng_vdwl += total_energy_tensor.item<float>();
  // Scatter model forces back to LAMMPS atom order via tag2i.
  for (int itag = 0; itag < nlocal; itag++) {
    int i = tag2i[itag];
    f[i][0] += forces[itag][0];
    f[i][1] += forces[itag][1];
    f[i][2] += forces[itag][2];
  }
  if (vflag) {
    // more accurately, it is virial part of stress
    torch::Tensor stress_tensor = output.at("inferred_stress").toTensor().cpu();
    auto virial_stress_tensor = stress_tensor * inp_cell_volume;
    // xy yz zx order in vasp (voigt is xx yy zz yz xz xy)
    auto virial_stress = virial_stress_tensor.accessor<float, 1>();
    virial[0] += virial_stress[0];
    virial[1] += virial_stress[1];
    virial[2] += virial_stress[2];
    virial[3] += virial_stress[3];
    virial[4] += virial_stress[5];
    virial[5] += virial_stress[4];
  }
  if (eflag_atom) {
    torch::Tensor atomic_energy_tensor =
        output.at("atomic_energy").toTensor().cpu().squeeze();
    auto atomic_energy = atomic_energy_tensor.accessor<float, 1>();
    for (int itag = 0; itag < nlocal; itag++) {
      int i = tag2i[itag];
      eatom[i] += atomic_energy[itag];
    }
  }
  // Cache the edge-count bound with 20% slack to size next step's arrays.
  // if it was the first MD step
  if (this->nedges_bound == -1) {
    this->nedges_bound = nedges * 1.2;
  } // else if the nedges is too small, increase the bound
  else if (nedges > this->nedges_bound / 1.2) {
    this->nedges_bound = nedges * 1.2;
  }
}
// allocate arrays (called from coeff)
// Create the per-type tables LAMMPS expects (invoked from coeff()).
void PairE3GNN::allocate() {
  allocated = 1;
  const int np1 = atom->ntypes + 1; // LAMMPS type indices start at 1
  memory->create(setflag, np1, np1, "pair:setflag");
  memory->create(cutsq, np1, np1, "pair:cutsq");
  memory->create(map, np1, "pair:map");
}
// global settings for pair_style
// pair_style e3gnn takes no arguments; anything extra is an error.
void PairE3GNN::settings(int narg, char **arg) {
  if (narg == 0)
    return;
  error->all(FLERR, "Illegal pair_style command");
}
void PairE3GNN::coeff(int narg, char **arg) {
if (allocated) {
error->all(FLERR, "pair_e3gnn coeff called twice");
}
allocate();
if (strcmp(arg[0], "*") != 0 || strcmp(arg[1], "*") != 0) {
error->all(FLERR,
"e3gnn: first and second input of pair_coeff should be '*'");
}
// expected input : pair_coeff * * pot.pth type_name1 type_name2 ...
std::unordered_map<std::string, std::string> meta_dict = {
{"chemical_symbols_to_index", ""},
{"cutoff", ""},
{"num_species", ""},
{"model_type", ""},
{"version", ""},
{"dtype", ""},
{"time", ""}};
// model loading from input
try {
model = torch::jit::load(std::string(arg[2]), device, meta_dict);
} catch (const c10::Error &e) {
error->all(FLERR, "error loading the model, check the path of the model");
}
// model = torch::jit::freeze(model); model is already freezed
torch::jit::setGraphExecutorOptimize(false);
torch::jit::FusionStrategy strategy;
// thing about dynamic recompile as tensor shape varies, this is default
// strategy = {{torch::jit::FusionBehavior::DYNAMIC, 3}};
strategy = {{torch::jit::FusionBehavior::STATIC, 0}};
torch::jit::setFusionStrategy(strategy);
cutoff = std::stod(meta_dict["cutoff"]);
cutoff_square = cutoff * cutoff;
if (meta_dict["model_type"].compare("E3_equivariant_model") != 0) {
error->all(FLERR, "given model type is not E3_equivariant_model");
}
std::string chem_str = meta_dict["chemical_symbols_to_index"];
int ntypes = atom->ntypes;
auto delim = " ";
char *tok = std::strtok(const_cast<char *>(chem_str.c_str()), delim);
std::vector<std::string> chem_vec;
while (tok != nullptr) {
chem_vec.push_back(std::string(tok));
tok = std::strtok(nullptr, delim);
}
bool found_flag = false;
for (int i = 3; i < narg; i++) {
found_flag = false;
for (int j = 0; j < chem_vec.size(); j++) {
if (chem_vec[j].compare(arg[i]) == 0) {
map[i - 2] = j;
found_flag = true;
fprintf(lmp->logfile, "Chemical specie '%s' is assigned to type %d\n",
arg[i], i - 2);
break;
}
}
if (!found_flag) {
error->all(FLERR, "Unknown chemical specie is given");
}
}
if (ntypes > narg - 3) {
error->all(FLERR, "Not enough chemical specie is given. Check pair_coeff "
"and types in your data/script");
}
for (int i = 1; i <= ntypes; i++) {
for (int j = 1; j <= ntypes; j++) {
if ((map[i] >= 0) && (map[j] >= 0)) {
setflag[i][j] = 1;
cutsq[i][j] = cutoff * cutoff;
}
}
}
if (lmp->logfile) {
fprintf(lmp->logfile, "from sevenn version '%s' ",
meta_dict["version"].c_str());
fprintf(lmp->logfile, "%s precision model, deployed when: %s\n",
meta_dict["dtype"].c_str(), meta_dict["time"].c_str());
}
}
// init specific to this pair
// Pair-style initialization: request a full neighbor list, since E3GNN is a
// many-body model.
void PairE3GNN::init_style() {
  // The historical newton-pair check is intentionally omitted: the flag is
  // irrelevant when the simulation runs on a single processor.
  /*
  if (force->newton_pair == 0) {
    error->all(FLERR, "Pair style nn requires newton pair on");
  }
  */
  neighbor->add_request(this, NeighConst::REQ_FULL);
}
// All type pairs share the model's cutoff; no mixing rules apply.
double PairE3GNN::init_one(int i, int j) {
  return cutoff;
}
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef PAIR_CLASS
PairStyle(e3gnn, PairE3GNN)
#else
#ifndef LMP_PAIR_E3GNN
#define LMP_PAIR_E3GNN
#include "pair.h"
#include <torch/torch.h>
namespace LAMMPS_NS {
/* LAMMPS pair style wrapping a deployed E3GNN TorchScript model.
 * NOTE(review): the .cpp also uses 'map' and 'elements', which are not
 * declared here — presumably inherited from Pair in this LAMMPS version;
 * confirm against the Pair base class. */
class PairE3GNN : public Pair {
private:
  double cutoff;              // model cutoff radius (from model metadata)
  double cutoff_square;       // cutoff^2, compared against squared distances
  torch::jit::Module model;   // deployed TorchScript model
  torch::Device device = torch::kCPU;  // chosen in the constructor (CUDA if available)
  int nelements;
  bool print_info = false;    // verbose output toggle (SEVENN_PRINT_INFO env var)
  int nedges_bound = -1;      // cached edge-count upper bound; -1 until first compute()
public:
  PairE3GNN(class LAMMPS *);
  ~PairE3GNN();
  void compute(int, int);
  void settings(int, char **);
  // read Atom type string from input script & related coeff
  void coeff(int, char **);
  void allocate();
  void init_style();
  double init_one(int, int);
};
} // namespace LAMMPS_NS
#endif
#endif
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://lammps.sandia.gov/, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Yutack Park (SNU)
------------------------------------------------------------------------- */
#include <ATen/core/Dict.h>
#include <ATen/core/ivalue_inl.h>
#include <ATen/ops/from_blob.h>
#include <c10/core/Scalar.h>
#include <c10/core/TensorOptions.h>
#include <cstdlib>
#include <filesystem>
#include <numeric>
#include <string>
#include <torch/csrc/jit/api/module.h>
#include <torch/script.h>
#include <torch/torch.h>
#include <cuda_runtime.h>
#include "atom.h"
#include "comm.h"
#include "comm_brick.h"
#include "error.h"
#include "force.h"
#include "memory.h"
#include "neigh_list.h"
#include "neighbor.h"
// #include "nvToolsExt.h"
#include "pair_e3gnn_parallel.h"
#include <cassert>
#ifdef OMPI_MPI_H
#include "mpi-ext.h" //This should be included after mpi.h which is included in pair.h
#endif
using namespace LAMMPS_NS;
#define INTEGER_TYPE torch::TensorOptions().dtype(torch::kInt64)
#define FLOAT_TYPE torch::TensorOptions().dtype(torch::kFloat)
// Meyers-singleton accessor: the buffer manager is constructed on first use
// and destroyed at program exit (freeing its device buffers).
DeviceBuffManager &DeviceBuffManager::getInstance() {
  static DeviceBuffManager instance;
  return instance;
}
void DeviceBuffManager::get_buffer(int send_size, int recv_size,
float *&buf_send_ptr, float *&buf_recv_ptr) {
if (send_size > send_buf_size) {
cudaFree(buf_send_device);
cudaError_t cuda_err =
cudaMalloc(&buf_send_device, send_size * sizeof(float));
send_buf_size = send_size;
}
if (recv_size > recv_buf_size) {
cudaFree(buf_recv_device);
cudaError_t cuda_err =
cudaMalloc(&buf_recv_device, recv_size * sizeof(float));
recv_buf_size = recv_size;
}
buf_send_ptr = buf_send_device;
buf_recv_ptr = buf_recv_device;
}
// Release the device staging buffers (cudaFree(nullptr) is a no-op, so this
// is safe even if get_buffer was never called).
DeviceBuffManager::~DeviceBuffManager() {
  cudaFree(buf_send_device);
  cudaFree(buf_recv_device);
}
// Constructor: detect GPU and CUDA-aware MPI availability, select the
// compute device and the device used for MPI staging buffers, and report
// the configuration on screen/logfile when those streams exist.
PairE3GNNParallel::PairE3GNNParallel(LAMMPS *lmp) : Pair(lmp) {
  const char *print_flag = std::getenv("SEVENN_PRINT_INFO");
  const char *print_both_flag = std::getenv("SEVENN_PRINT_BOTH_INFO");
  if (print_flag) {
    world_rank = comm->me;
    std::cout << "process rank: " << world_rank << " initialized" << std::endl;
    print_info = (world_rank == 0) || print_both_flag;
  }
  std::string device_name;
  const bool use_gpu = torch::cuda::is_available();
  comm_forward = 0;
  comm_reverse = 0;
  // OpenMPI detection of CUDA-aware support (compile-time + runtime query)
#ifdef OMPI_MPI_H
#if defined(MPIX_CUDA_AWARE_SUPPORT)
  if (1 == MPIX_Query_cuda_support()) {
    use_cuda_mpi = true;
  } else {
    use_cuda_mpi = false;
  }
#else
  use_cuda_mpi = false;
#endif
#else
  use_cuda_mpi = false;
#endif
  if (use_gpu) {
    device = get_cuda_device();
    device_name = "CUDA";
  } else {
    device = torch::kCPU;
    device_name = "CPU";
  }
  // user override to force host-staged communication
  if (std::getenv("OFF_E3GNN_PARALLEL_CUDA_MPI")) {
    use_cuda_mpi = false;
  }
  // Decide the communication device once, independent of logging streams.
  // Bug fix: this assignment previously lived inside the `if (lmp->screen)`
  // and `if (lmp->logfile)` blocks, so device_comm silently kept its default
  // when neither stream was open.
  const bool host_staged_comm = use_gpu && !use_cuda_mpi;
  device_comm = host_staged_comm ? torch::kCPU : device;
  if (lmp->screen) {
    if (host_staged_comm) {
      fprintf(lmp->screen,
              "cuda-aware mpi not found, communicate via host device\n");
    }
    fprintf(lmp->screen, "PairE3GNNParallel using device : %s\n",
            device_name.c_str());
    fprintf(lmp->screen, "PairE3GNNParallel cuda-aware mpi: %s\n",
            use_cuda_mpi ? "True" : "False");
  }
  if (lmp->logfile) {
    if (host_staged_comm) {
      fprintf(lmp->logfile,
              "cuda-aware mpi not found, communicate via host device\n");
    }
    fprintf(lmp->logfile, "PairE3GNNParallel using device : %s\n",
            device_name.c_str());
    fprintf(lmp->logfile, "PairE3GNNParallel cuda-aware mpi: %s\n",
            use_cuda_mpi ? "True" : "False");
  }
}
// Assign this MPI rank to a GPU round-robin over the visible devices and
// make it the current CUDA device. Called only when torch reports CUDA is
// available.
torch::Device PairE3GNNParallel::get_cuda_device() {
  const int rank = comm->me;
  const int num_gpus = torch::cuda::device_count();
  // guard the modulo below against division by zero; is_available() was
  // true in the caller so this should not normally trigger
  if (num_gpus < 1) {
    error->all(FLERR, "e3gnn/parallel: CUDA reported available but no "
                      "CUDA device was found");
  }
  const int idx = rank % num_gpus;
  if (print_info)
    std::cout << world_rank << " Available # of GPUs found: " << num_gpus
              << std::endl;
  cudaError_t cuda_err = cudaSetDevice(idx);
  if (cuda_err != cudaSuccess) {
    std::cerr << "E3GNN: Failed to set CUDA device: "
              << cudaGetErrorString(cuda_err) << std::endl;
  }
  return torch::Device(torch::kCUDA, idx);
}
// Destructor: free the per-type tables created in allocate(); guarded
// because coeff() (and hence allocate()) may never have run.
PairE3GNNParallel::~PairE3GNNParallel() {
  if (allocated) {
    memory->destroy(setflag);
    memory->destroy(cutsq);
    memory->destroy(map);
  }
}
// Per-atom feature width of the tensors currently being communicated.
int PairE3GNNParallel::get_x_dim() { return x_dim; }
// Whether buffers are exchanged directly on the GPU via CUDA-aware MPI.
bool PairE3GNNParallel::use_cuda_mpi_() { return use_cuda_mpi; }
// Whether the index-building (preprocess) pass already ran this step.
bool PairE3GNNParallel::is_comm_preprocess_done() {
  return comm_preprocess_done;
}
void PairE3GNNParallel::compute(int eflag, int vflag) {
/*
Graph build on cpu
*/
if (eflag || vflag)
ev_setup(eflag, vflag);
else
evflag = vflag_fdotr = 0;
if (vflag_atom) {
error->all(FLERR, "atomic stress is not supported\n");
}
if (atom->tag_consecutive() == 0) {
error->all(FLERR, "Pair e3gnn requires consecutive atom IDs");
}
double **x = atom->x;
double **f = atom->f;
int *type = atom->type;
int nlocal = list->inum; // same as nlocal
int nghost = atom->nghost;
int ntotal = nlocal + nghost;
int *ilist = list->ilist;
int inum = list->inum;
CommBrick *comm_brick = dynamic_cast<CommBrick *>(comm);
if (comm_brick == nullptr) {
error->all(FLERR, "e3gnn/parallel: comm style should be brick & from "
"modified code of comm_brick");
}
bigint natoms = atom->natoms;
// tag ignore PBC
tagint *tag = atom->tag;
// store graph_idx from local to known ghost atoms(ghost atoms inside cutoff)
int tag_to_graph_idx[natoms + 1]; // tag starts from 1 not 0
std::fill_n(tag_to_graph_idx, natoms + 1, -1);
// to access tag_to_graph_idx from comm
tag_to_graph_idx_ptr = tag_to_graph_idx;
int graph_indexer = nlocal;
int graph_index_to_i[ntotal];
int *numneigh = list->numneigh; // j loop cond
int **firstneigh = list->firstneigh; // j list
const int nedges_upper_bound =
std::accumulate(numneigh, numneigh + nlocal, 0);
std::vector<long> node_type;
std::vector<long> node_type_ghost;
float edge_vec[nedges_upper_bound][3];
long edge_idx_src[nedges_upper_bound];
long edge_idx_dst[nedges_upper_bound];
int nedges = 0;
for (int ii = 0; ii < inum; ii++) {
// populate tag_to_graph_idx of local atoms
const int i = ilist[ii];
const int itag = tag[i];
const int itype = type[i];
tag_to_graph_idx[itag] = ii;
graph_index_to_i[ii] = i;
node_type.push_back(map[itype]);
}
// loop over neighbors, build graph
for (int ii = 0; ii < inum; ii++) {
const int i = ilist[ii];
const int i_graph_idx = ii;
const int *jlist = firstneigh[i];
const int jnum = numneigh[i];
for (int jj = 0; jj < jnum; jj++) {
int j = jlist[jj];
const int jtag = tag[j];
j &= NEIGHMASK;
const int jtype = type[j];
// we have to calculate Rij to check cutoff in lammps side
const double delij[3] = {x[j][0] - x[i][0], x[j][1] - x[i][1],
x[j][2] - x[i][2]};
const double Rij =
delij[0] * delij[0] + delij[1] * delij[1] + delij[2] * delij[2];
int j_graph_idx;
if (Rij < cutoff_square) {
// if given j is not local atom and inside cutoff
if (tag_to_graph_idx[jtag] == -1) {
// if j is ghost atom inside cutoff but first seen
tag_to_graph_idx[jtag] = graph_indexer;
graph_index_to_i[graph_indexer] = j;
node_type_ghost.push_back(map[jtype]);
graph_indexer++;
}
j_graph_idx = tag_to_graph_idx[jtag];
edge_idx_src[nedges] = i_graph_idx;
edge_idx_dst[nedges] = j_graph_idx;
edge_vec[nedges][0] = delij[0];
edge_vec[nedges][1] = delij[1];
edge_vec[nedges][2] = delij[2];
nedges++;
}
} // j loop end
} // i loop end
// member variable
graph_size = graph_indexer;
const int ghost_node_num = graph_size - nlocal;
// convert data to Tensor
auto inp_node_type = torch::from_blob(node_type.data(), nlocal, INTEGER_TYPE);
auto inp_node_type_ghost =
torch::from_blob(node_type_ghost.data(), ghost_node_num, INTEGER_TYPE);
long num_nodes[1] = {long(nlocal)};
auto inp_num_atoms = torch::from_blob(num_nodes, {1}, INTEGER_TYPE);
auto edge_idx_src_tensor =
torch::from_blob(edge_idx_src, {nedges}, INTEGER_TYPE);
auto edge_idx_dst_tensor =
torch::from_blob(edge_idx_dst, {nedges}, INTEGER_TYPE);
auto inp_edge_index =
torch::stack({edge_idx_src_tensor, edge_idx_dst_tensor});
auto inp_edge_vec = torch::from_blob(edge_vec, {nedges, 3}, FLOAT_TYPE);
if (print_info) {
std::cout << world_rank << " Nlocal: " << nlocal << std::endl;
std::cout << world_rank << " Graph_size: " << graph_size << std::endl;
std::cout << world_rank << " Ghost_node_num: " << ghost_node_num
<< std::endl;
std::cout << world_rank << " Nedges: " << nedges << "\n" << std::endl;
}
// r_original requires grad True
inp_edge_vec.set_requires_grad(true);
torch::Dict<std::string, torch::Tensor> input_dict;
input_dict.insert("x", inp_node_type.to(device));
input_dict.insert("x_ghost", inp_node_type_ghost.to(device));
input_dict.insert("edge_index", inp_edge_index.to(device));
input_dict.insert("edge_vec", inp_edge_vec.to(device));
input_dict.insert("num_atoms", inp_num_atoms.to(device));
input_dict.insert("nlocal", inp_num_atoms.to(torch::kCPU));
std::list<std::vector<torch::Tensor>> wrt_tensors;
wrt_tensors.push_back({input_dict.at("edge_vec")});
auto model_part = model_list.front();
auto output = model_part.forward({input_dict}).toGenericDict();
comm_preprocess();
// extra_graph_idx_map is set from comm_preprocess();
// last one is for trash values. See pack_forward_init
const int extra_size =
ghost_node_num + static_cast<int>(extra_graph_idx_map.size()) + 1;
torch::Tensor x_local;
torch::Tensor x_ghost;
for (auto it = model_list.begin(); it != model_list.end(); ++it) {
if (it == model_list.begin())
continue;
model_part = *it;
x_local = output.at("x").toTensor().detach().to(device);
x_dim = x_local.size(1); // length of per atom vector(node feature)
auto ghost_and_extra_x = torch::zeros({ghost_node_num + extra_size, x_dim},
FLOAT_TYPE.device(device));
x_comm = torch::cat({x_local, ghost_and_extra_x}, 0).to(device_comm);
comm_brick->forward_comm(this); // populate x_ghost by communication
// What we got from forward_comm (node feature of ghosts)
x_ghost = torch::split_with_sizes(
x_comm, {nlocal, ghost_node_num, extra_size}, 0)[1];
x_ghost.set_requires_grad(true);
// prepare next input (output > next input)
output.insert_or_assign("x_ghost", x_ghost.to(device));
// make another edge_vec to discriminate grad calculation with other
// edge_vecs(maybe redundant?)
output.insert_or_assign("edge_vec",
output.at("edge_vec").toTensor().clone());
// save tensors for backprop
wrt_tensors.push_back({output.at("edge_vec").toTensor(),
output.at("x").toTensor(),
output.at("self_cont_tmp").toTensor(),
output.at("x_ghost").toTensor()});
output = model_part.forward({output}).toGenericDict();
}
torch::Tensor energy_tensor =
output.at("inferred_total_energy").toTensor().squeeze();
torch::Tensor dE_dr =
torch::zeros({nedges, 3}, FLOAT_TYPE.device(device)); // create on device
torch::Tensor x_local_save; // holds grad info of x_local (it loses its grad
// when sends to CPU)
torch::Tensor self_conn_grads;
std::vector<torch::Tensor> grads;
std::vector<torch::Tensor> of_tensor;
// TODO: most values of self_conn_grads were zero because we use only scalars
// for energy
for (auto rit = wrt_tensors.rbegin(); rit != wrt_tensors.rend(); ++rit) {
// edge_vec, x, x_ghost order
auto wrt_tensor = *rit;
if (rit == wrt_tensors.rbegin()) {
grads = torch::autograd::grad({energy_tensor}, wrt_tensor);
} else {
x_local_save.copy_(x_local);
// of wrt grads_output
grads = torch::autograd::grad(of_tensor, wrt_tensor,
{x_local_save, self_conn_grads});
}
dE_dr = dE_dr + grads.at(0); // accumulate force
if (std::distance(rit, wrt_tensors.rend()) == 1)
continue; // if last iteration
of_tensor.clear();
of_tensor.push_back(wrt_tensor[1]); // x
of_tensor.push_back(wrt_tensor[2]); // self_cont_tmp
x_local_save = grads.at(1); // for grads_output
x_local = x_local_save.detach(); // grad_outputs & communication
x_dim = x_local.size(1);
self_conn_grads = grads.at(2); // no communication, for grads_output
x_ghost = grads.at(3).detach(); // yes communication, not for grads_output
auto extra_x = torch::zeros({extra_size, x_dim}, FLOAT_TYPE.device(device));
x_comm = torch::cat({x_local, x_ghost, extra_x}, 0).to(device_comm);
comm_brick->reverse_comm(this); // completes x_local
// now x_local is complete (dE_dx), become next grads_output(with
// self_conn_grads)
x_local = torch::split_with_sizes(
x_comm, {nlocal, ghost_node_num, extra_size}, 0)[0];
}
// postprocessing
if (print_info) {
size_t free, tot;
cudaMemGetInfo(&free, &tot);
std::cout << world_rank << " MEM use after backward(MB)" << std::endl;
double Mfree = static_cast<double>(free) / (1024 * 1024);
double Mtot = static_cast<double>(tot) / (1024 * 1024);
std::cout << world_rank << " Total: " << Mtot << std::endl;
std::cout << world_rank << " Free: " << Mfree << std::endl;
std::cout << world_rank << " Used: " << Mtot - Mfree << std::endl;
double Mused = Mtot - Mfree;
std::cout << world_rank << " Used/Nedges: " << Mused / nedges << std::endl;
std::cout << world_rank << " Used/Nlocal: " << Mused / nlocal << std::endl;
std::cout << world_rank << " Used/GraphSize: " << Mused / graph_size << "\n"
<< std::endl;
}
eng_vdwl += energy_tensor.item<float>(); // accumulate energy
dE_dr = dE_dr.to(torch::kCPU);
torch::Tensor force_tensor = torch::zeros({graph_indexer, 3});
auto _edge_idx_src_tensor =
edge_idx_src_tensor.repeat_interleave(3).view({nedges, 3});
auto _edge_idx_dst_tensor =
edge_idx_dst_tensor.repeat_interleave(3).view({nedges, 3});
force_tensor.scatter_reduce_(0, _edge_idx_src_tensor, dE_dr, "sum");
force_tensor.scatter_reduce_(0, _edge_idx_dst_tensor, torch::neg(dE_dr),
"sum");
auto forces = force_tensor.accessor<float, 2>();
for (int graph_idx = 0; graph_idx < graph_indexer; graph_idx++) {
int i = graph_index_to_i[graph_idx];
f[i][0] += forces[graph_idx][0];
f[i][1] += forces[graph_idx][1];
f[i][2] += forces[graph_idx][2];
}
if (vflag) {
auto diag = inp_edge_vec * dE_dr;
auto s12 = inp_edge_vec.select(1, 0) * dE_dr.select(1, 1);
auto s23 = inp_edge_vec.select(1, 1) * dE_dr.select(1, 2);
auto s31 = inp_edge_vec.select(1, 2) * dE_dr.select(1, 0);
std::vector<torch::Tensor> voigt_list = {
diag, s12.unsqueeze(-1), s23.unsqueeze(-1), s31.unsqueeze(-1)};
auto voigt = torch::cat(voigt_list, 1);
torch::Tensor per_atom_stress_tensor = torch::zeros({graph_indexer, 6});
auto _edge_idx_dst6_tensor =
edge_idx_dst_tensor.repeat_interleave(6).view({nedges, 6});
per_atom_stress_tensor.scatter_reduce_(0, _edge_idx_dst6_tensor, voigt,
"sum");
auto virial_stress_tensor =
torch::neg(torch::sum(per_atom_stress_tensor, 0));
auto virial_stress = virial_stress_tensor.accessor<float, 1>();
virial[0] += virial_stress[0];
virial[1] += virial_stress[1];
virial[2] += virial_stress[2];
virial[3] += virial_stress[3];
virial[4] += virial_stress[5];
virial[5] += virial_stress[4];
}
if (eflag_atom) {
torch::Tensor atomic_energy_tensor =
output.at("atomic_energy").toTensor().cpu().squeeze();
auto atomic_energy = atomic_energy_tensor.accessor<float, 1>();
for (int graph_idx = 0; graph_idx < nlocal; graph_idx++) {
int i = graph_index_to_i[graph_idx];
eatom[i] += atomic_energy[graph_idx];
}
}
// clean up comm preprocess variables
comm_preprocess_done = false;
for (int i = 0; i < 6; i++) {
// array of vector<long>
comm_index_pack_forward[i].clear();
comm_index_unpack_forward[i].clear();
comm_index_unpack_reverse[i].clear();
}
extra_graph_idx_map.clear();
}
// allocate arrays (called from coeff)
// Create the per-type tables LAMMPS expects from a pair style; called once
// from coeff(). Tables are indexed 1..ntypes, hence the +1 sizing.
void PairE3GNNParallel::allocate() {
  allocated = 1;
  const int ntypes = atom->ntypes;
  memory->create(setflag, ntypes + 1, ntypes + 1, "pair:setflag");
  memory->create(cutsq, ntypes + 1, ntypes + 1, "pair:cutsq");
  memory->create(map, ntypes + 1, "pair:map");
}
// global settings for pair_style
// pair_style e3gnn/parallel accepts no arguments; everything is configured
// through pair_coeff.
void PairE3GNNParallel::settings(int narg, char **arg) {
  if (narg != 0)
    error->all(FLERR, "Illegal pair_style command");
}
// pair_coeff * * <n_model> <model dir | n_model .pt files> <type names...>
// Loads the chain of deployed TorchScript model segments, reads the model
// metadata (cutoff, species list, comm buffer size, ...) and maps LAMMPS
// atom types onto the model's chemical-species indices.
void PairE3GNNParallel::coeff(int narg, char **arg) {
  if (allocated) {
    error->all(FLERR, "pair_e3gnn coeff called twice");
  }
  allocate();
  if (strcmp(arg[0], "*") != 0 || strcmp(arg[1], "*") != 0) {
    error->all(FLERR,
               "e3gnn: first and second input of pair_coeff should be '*'");
  }
  // metadata entries requested from the TorchScript archive on load
  std::unordered_map<std::string, std::string> meta_dict = {
      {"chemical_symbols_to_index", ""},
      {"cutoff", ""},
      {"num_species", ""},
      {"model_type", ""},
      {"version", ""},
      {"dtype", ""},
      {"time", ""},
      {"comm_size", ""}};
  // model loading from input
  int n_model = std::stoi(arg[2]);
  int chem_arg_i = 4; // index of first chemical-symbol argument
  std::vector<std::string> model_fnames;
  if (std::filesystem::exists(arg[3])) {
    if (std::filesystem::is_directory(arg[3])) {
      // directory form: deployed_parallel_{i}.pt inside the given directory
      auto headf = std::string(arg[3]);
      for (int i = 0; i < n_model; i++) {
        auto stri = std::to_string(i);
        model_fnames.push_back(headf + "/deployed_parallel_" + stri + ".pt");
      }
    } else if (std::filesystem::is_regular_file(arg[3])) {
      // explicit-file form: n_model file names follow
      for (int i = 3; i < n_model + 3; i++) {
        model_fnames.push_back(std::string(arg[i]));
      }
      chem_arg_i = n_model + 3;
    } else {
      error->all(FLERR, "No such file or directory:" + std::string(arg[3]));
    }
  } else {
    // fail early; previously a missing path fell through silently, leaving
    // the model list empty and crashing later in compute()
    error->all(FLERR, "No such file or directory:" + std::string(arg[3]));
  }
  for (const auto &modelf : model_fnames) {
    if (!std::filesystem::is_regular_file(modelf)) {
      error->all(FLERR, "Expected this is a regular file:" + modelf);
    }
    model_list.push_back(torch::jit::load(modelf, device, meta_dict));
  }
  torch::jit::setGraphExecutorOptimize(false);
  torch::jit::FusionStrategy strategy;
  strategy = {{torch::jit::FusionBehavior::STATIC, 0}};
  torch::jit::setFusionStrategy(strategy);
  cutoff = std::stod(meta_dict["cutoff"]);
  // maximum possible size of per atom x before last convolution
  // (stoi, not stod: comm_size is an integer count)
  int comm_size = std::stoi(meta_dict["comm_size"]);
  // to initialize buffer size for communication
  comm_forward = comm_size;
  comm_reverse = comm_size;
  cutoff_square = cutoff * cutoff;
  if (meta_dict["model_type"].compare("E3_equivariant_model") != 0) {
    error->all(FLERR, "given model type is not E3_equivariant_model");
  }
  // Split the space-separated chemical-symbol list. The previous strtok on
  // const_cast<char*>(std::string::c_str()) wrote into the string's internal
  // buffer, which is undefined behavior.
  const std::string &chem_str = meta_dict["chemical_symbols_to_index"];
  int ntypes = atom->ntypes;
  std::vector<std::string> chem_vec;
  size_t pos = 0;
  while (pos < chem_str.size()) {
    size_t next = chem_str.find(' ', pos);
    if (next == std::string::npos) next = chem_str.size();
    if (next > pos) chem_vec.push_back(chem_str.substr(pos, next - pos));
    pos = next + 1;
  }
  // what if unknown chemical specie is in arg? should I abort? is there any use
  // case for that?
  bool found_flag = false;
  int n_chem = narg - chem_arg_i;
  for (int i = 0; i < n_chem; i++) {
    found_flag = false;
    for (size_t j = 0; j < chem_vec.size(); j++) {
      if (chem_vec[j].compare(arg[i + chem_arg_i]) == 0) {
        map[i + 1] = static_cast<int>(j); // store from 1, (not 0)
        found_flag = true;
        if (lmp->logfile) {
          fprintf(lmp->logfile, "Chemical specie '%s' is assigned to type %d\n",
                  arg[i + chem_arg_i], i + 1);
        }
        // break moved out of the logfile check: the scan should stop on a
        // match whether or not a logfile is open
        break;
      }
    }
    if (!found_flag) {
      error->all(FLERR, "Unknown chemical specie is given or the number of "
                        "potential files is not consistent");
    }
  }
  for (int i = 1; i <= ntypes; i++) {
    for (int j = 1; j <= ntypes; j++) {
      if ((map[i] >= 0) && (map[j] >= 0)) {
        setflag[i][j] = 1;
        cutsq[i][j] = cutoff * cutoff;
      }
    }
  }
  if (lmp->logfile) {
    fprintf(lmp->logfile, "from sevenn version '%s' ",
            meta_dict["version"].c_str());
    fprintf(lmp->logfile, "%s precision model, deployed when: %s\n",
            meta_dict["dtype"].c_str(), meta_dict["time"].c_str());
  }
}
// init specific to this pair
// Pair-style initialization hook: this style needs a full neighbor list and
// newton pair on (each edge is built once from its owning atom).
void PairE3GNNParallel::init_style() {
  if (force->newton_pair == 0)
    error->all(FLERR, "Pair style e3gnn/parallel requires newton pair on");
  neighbor->add_request(this, NeighConst::REQ_FULL);
}
// All type pairs share the single model cutoff; nothing to mix per pair.
double PairE3GNNParallel::init_one(int i, int j) { return cutoff; }
// Cache CommBrick's per-swap send/recv ranks for the six swap directions;
// comm_preprocess() groups duplicate-index bookkeeping by send rank.
void PairE3GNNParallel::notify_proc_ids(const int *sendproc, const int *recvproc) {
  for (int phase = 0; phase < 6; ++phase) {
    this->sendproc[phase] = sendproc[phase];
    this->recvproc[phase] = recvproc[phase];
  }
}
// One dry run (per MD step) of forward communication to build the pack /
// unpack index maps used by the real feature and gradient communications.
void PairE3GNNParallel::comm_preprocess() {
  assert(!comm_preprocess_done);
  CommBrick *comm_brick = dynamic_cast<CommBrick *>(comm);
  // fake lammps communication call to preprocess index
  // gives complete comm_index_pack, unpack_forward, and extra_graph_idx_map
  comm_brick->forward_comm(this);
  // track, per destination rank, which graph indices were already routed
  std::map<int, std::set<int>> already_met_map;
  for (int comm_phase = 0; comm_phase < 6; comm_phase++) {
    const int n = comm_index_pack_forward[comm_phase].size();
    int sproc = this->sendproc[comm_phase];
    if (already_met_map.count(sproc) == 0) {
      already_met_map.insert({sproc, std::set<int>()});
    }
    // for unpack_reverse, Ignore duplicated index by 'already_met'
    std::vector<long> &idx_map_forward = comm_index_pack_forward[comm_phase];
    std::vector<long> &idx_map_reverse = comm_index_unpack_reverse[comm_phase];
    std::set<int>& already_met = already_met_map[sproc];
    // the last index of x_comm is used to trash unnecessary values
    const int trash_index =
        graph_size + static_cast<int>(extra_graph_idx_map.size()); //+ 1;
    for (int i = 0; i < n; i++) {
      const int idx = idx_map_forward[i];
      if (idx < graph_size) {
        // a real graph row: route duplicates to the trash row so reverse
        // communication does not double-count a contribution
        if (already_met.count(idx) == 1) {
          idx_map_reverse.push_back(trash_index);
        } else {
          idx_map_reverse.push_back(idx);
          already_met.insert(idx);
        }
      } else {
        // extra-region row: forwarded as-is
        idx_map_reverse.push_back(idx);
      }
    }
    if (use_cuda_mpi) {
      // NOTE(review): from_blob aliases host vector memory; the immediate
      // .to(device) copies it, so the temporaries here appear safe
      comm_index_pack_forward_tensor[comm_phase] = torch::from_blob(idx_map_forward.data(), idx_map_forward.size(), INTEGER_TYPE).to(device);
      auto upmap = comm_index_unpack_forward[comm_phase];
      comm_index_unpack_forward_tensor[comm_phase] = torch::from_blob(upmap.data(), upmap.size(), INTEGER_TYPE).to(device);
      comm_index_unpack_reverse_tensor[comm_phase] = torch::from_blob(idx_map_reverse.data(), idx_map_reverse.size(), INTEGER_TYPE).to(device);
    }
  }
  comm_preprocess_done = true;
}
// called from comm_brick if comm_preprocess_done is false
// Index-building pass of a forward communication phase (called from the
// modified CommBrick while comm_preprocess_done is false): record, for each
// atom to send, the x_comm row index to read from.
void PairE3GNNParallel::pack_forward_init(int n, int *list_send,
                                          int comm_phase) {
  std::vector<long> &idx_map = comm_index_pack_forward[comm_phase];
  idx_map.reserve(n);
  tagint *tag = atom->tag;
  for (int i = 0; i < n; i++) {
    int list_i = list_send[i];
    int graph_idx = tag_to_graph_idx_ptr[tag[list_i]];
    if (graph_idx != -1) {
      // known atom (local atom + ghost atom inside cutoff)
      idx_map.push_back(graph_idx);
    } else {
      // unknown atom, these are not used in computation in this process
      // instead, this process is used to hand over these atoms to other
      // process; hold them in continuous manner for tensor operations later
      auto found = extra_graph_idx_map.find(list_i);
      if (found != extra_graph_idx_map.end()) {
        idx_map.push_back(found->second);
      } else {
        // Bug fix: key the map by list_i (the atom index) so the find()
        // above can actually locate previously assigned slots; the original
        // inserted with the loop counter i, which never matches list_i.
        const long new_idx =
            graph_size + static_cast<long>(extra_graph_idx_map.size());
        extra_graph_idx_map[list_i] = new_idx;
        idx_map.push_back(new_idx);
      }
    }
  }
}
// called from comm_brick if comm_preprocess_done is false
// Index-building pass of forward communication (called from the modified
// CommBrick while comm_preprocess_done is false): record the x_comm row
// each received atom's features should be written to.
void PairE3GNNParallel::unpack_forward_init(int n, int first, int comm_phase) {
  std::vector<long> &idx_map = comm_index_unpack_forward[comm_phase];
  idx_map.reserve(n);
  const int last = first + n;
  tagint *tag = atom->tag;
  for (int i = first; i < last; i++) {
    int graph_idx = tag_to_graph_idx_ptr[tag[i]];
    if (graph_idx != -1) {
      // atom participates in this process' graph
      idx_map.push_back(graph_idx);
    } else {
      // atom outside cutoff: park it in the extra region after the graph
      // rows (index computed before insertion for clarity)
      const long new_idx =
          graph_size + static_cast<long>(extra_graph_idx_map.size());
      extra_graph_idx_map[i] = new_idx;
      idx_map.push_back(new_idx);
    }
  }
}
int PairE3GNNParallel::pack_forward_comm_gnn(float *buf, int comm_phase) {
std::vector<long> &idx_map = comm_index_pack_forward[comm_phase];
const int n = static_cast<int>(idx_map.size());
if (use_cuda_mpi && n != 0) {
torch::Tensor &idx_map_tensor = comm_index_pack_forward_tensor[comm_phase];
auto selected = x_comm.index_select(0, idx_map_tensor); // its size is x_dim * n
cudaError_t cuda_err =
cudaMemcpy(buf, selected.data_ptr<float>(), (x_dim * n) * sizeof(float),
cudaMemcpyDeviceToDevice);
} else {
int i, j, m;
m = 0;
for (i = 0; i < n; i++) {
const int idx = static_cast<int>(idx_map.at(i));
float *from = x_comm[idx].data_ptr<float>();
for (j = 0; j < x_dim; j++) {
buf[m++] = from[j];
}
}
}
if (print_info) {
std::cout << world_rank << " comm_phase: " << comm_phase << std::endl;
std::cout << world_rank << " pack_forward x_dim: " << x_dim << std::endl;
std::cout << world_rank << " pack_forward n: " << n << std::endl;
std::cout << world_rank << " pack_forward x_dim*n: " << x_dim * n
<< std::endl;
double Msend = static_cast<double>(x_dim * n * 4) / (1024 * 1024);
std::cout << world_rank << " send size(MB): " << Msend << "\n" << std::endl;
}
return x_dim * n;
}
// Scatter received per-atom features from buf into the x_comm rows given by
// this phase's unpack-forward index map (overwriting destination rows).
void PairE3GNNParallel::unpack_forward_comm_gnn(float *buf, int comm_phase) {
  std::vector<long> &idx_map = comm_index_unpack_forward[comm_phase];
  const int n = static_cast<int>(idx_map.size());
  if (use_cuda_mpi && n != 0) {
    // device path: buf already resides on the GPU (cuda-aware MPI)
    torch::Tensor &idx_map_tensor = comm_index_unpack_forward_tensor[comm_phase];
    auto buf_tensor =
        torch::from_blob(buf, {n, x_dim}, FLOAT_TYPE.device(device));
    x_comm.scatter_(0, idx_map_tensor.repeat_interleave(x_dim).view({n, x_dim}),
                    buf_tensor);
  } else {
    // host path: element-wise copy into each destination row
    int i, j, m;
    m = 0;
    for (i = 0; i < n; i++) {
      const int idx = static_cast<int>(idx_map.at(i));
      float *to = x_comm[idx].data_ptr<float>();
      for (j = 0; j < x_dim; j++) {
        to[j] = buf[m++];
      }
    }
  }
}
int PairE3GNNParallel::pack_reverse_comm_gnn(float *buf, int comm_phase) {
std::vector<long> &idx_map = comm_index_unpack_forward[comm_phase];
const int n = static_cast<int>(idx_map.size());
if (use_cuda_mpi && n != 0) {
torch::Tensor &idx_map_tensor = comm_index_unpack_forward_tensor[comm_phase];
auto selected = x_comm.index_select(0, idx_map_tensor);
cudaError_t cuda_err = cudaMemcpy(buf, selected.data_ptr<float>(), (x_dim * n) * sizeof(float), cudaMemcpyDeviceToDevice);
} else {
int i, j, m;
m = 0;
for (i = 0; i < n; i++) {
const int idx = static_cast<int>(idx_map.at(i));
float *from = x_comm[idx].data_ptr<float>();
for (j = 0; j < x_dim; j++) {
buf[m++] = from[j];
}
}
}
if (print_info) {
std::cout << world_rank << " comm_phase: " << comm_phase << std::endl;
std::cout << world_rank << " pack_reverse x_dim: " << x_dim << std::endl;
std::cout << world_rank << " pack_reverse n: " << n << std::endl;
std::cout << world_rank << " pack_reverse x_dim*n: " << x_dim * n
<< std::endl;
double Msend = static_cast<double>(x_dim * n * 4) / (1024 * 1024);
}
return x_dim * n;
}
// Accumulate received gradient rows into x_comm ("add" semantics: several
// neighbor processes may contribute to the same owned atom). Duplicated
// destinations were redirected to the trash row in comm_preprocess().
void PairE3GNNParallel::unpack_reverse_comm_gnn(float *buf, int comm_phase) {
  std::vector<long> &idx_map = comm_index_unpack_reverse[comm_phase];
  const int n = static_cast<int>(idx_map.size());
  if (use_cuda_mpi && n != 0) {
    torch::Tensor &idx_map_tensor = comm_index_unpack_reverse_tensor[comm_phase];
    auto buf_tensor =
        torch::from_blob(buf, {n, x_dim}, FLOAT_TYPE.device(device));
    x_comm.scatter_(0, idx_map_tensor.repeat_interleave(x_dim).view({n, x_dim}),
                    buf_tensor, "add");
  } else {
    int i, j, m;
    m = 0;
    for (i = 0; i < n; i++) {
      const int idx = static_cast<int>(idx_map.at(i));
      if (idx == -1) {
        // sentinel: skip this row's payload entirely
        m += x_dim;
        continue;
      }
      float *to = x_comm[idx].data_ptr<float>();
      for (j = 0; j < x_dim; j++) {
        to[j] += buf[m++];
      }
    }
  }
}
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef PAIR_CLASS
PairStyle(e3gnn/parallel, PairE3GNNParallel)
#else
#ifndef LMP_PAIR_E3GNN_PARALLEL
#define LMP_PAIR_E3GNN_PARALLEL
#include "pair.h"
#include <torch/torch.h>
#include <vector>
namespace LAMMPS_NS {
// Multi-process SevenNet E3-equivariant GNN pair style. The model is split
// into segments (model_list) run in sequence; between segments, ghost-atom
// features (forward) and their gradients (reverse) are exchanged with
// neighbor processes through a modified CommBrick.
class PairE3GNNParallel : public Pair {
 private:
  double cutoff;        // model cutoff radius, from model metadata
  double cutoff_square; // cutoff^2 for sqrt-free distance checks
  std::vector<torch::jit::Module> model_list; // TorchScript model segments
  torch::Device device = torch::kCPU;      // compute device
  torch::Device device_comm = torch::kCPU; // device MPI staging buffers use
  torch::Device get_cuda_device();
  bool use_cuda_mpi; // true when buffers can be exchanged directly on GPU
  // for communication
  // Most of these variables for communication is temporary and valid for only
  // one MD step.
  int x_dim; // to determine per atom data size
  int graph_size;
  torch::Tensor x_comm; // x_local + x_ghost + x_comm_extra
  void comm_preprocess();
  bool comm_preprocess_done = false;
  // temporary variables holds for each compute step
  std::unordered_map<int, long> extra_graph_idx_map;
  // To use scatter, store long instead of int
  // array of vector, one entry per swap direction
  std::vector<long> comm_index_pack_forward[6];
  std::vector<long> comm_index_unpack_forward[6];
  std::vector<long> comm_index_unpack_reverse[6];
  // its size is 6 and initialized at comm_preprocess()
  torch::Tensor comm_index_pack_forward_tensor[6];
  torch::Tensor comm_index_unpack_forward_tensor[6];
  torch::Tensor comm_index_unpack_reverse_tensor[6];
  // to use tag_to_graph_idx inside comm methods
  int *tag_to_graph_idx_ptr = nullptr;
  int sendproc[6]; // destination rank per swap direction
  int recvproc[6]; // source rank per swap direction

 public:
  PairE3GNNParallel(class LAMMPS *);
  ~PairE3GNNParallel();
  // TODO: keep encapsulation..
  void compute(int, int) override;
  void settings(int, char **) override;
  // read Atom type string from input script & related coeff
  void coeff(int, char **) override;
  void allocate();
  // index-building passes (first, "fake" communication of each step)
  void pack_forward_init(int n, int *list, int comm_phase);
  void unpack_forward_init(int n, int first, int comm_phase);
  // real feature/gradient communication callbacks used by CommBrick
  int pack_forward_comm_gnn(float *buf, int comm_phase);
  void unpack_forward_comm_gnn(float *buf, int comm_phase);
  int pack_reverse_comm_gnn(float *buf, int comm_phase);
  void unpack_reverse_comm_gnn(float *buf, int comm_phase);
  void init_style() override;
  double init_one(int, int) override;
  int get_x_dim();
  bool use_cuda_mpi_();
  bool is_comm_preprocess_done();
  void notify_proc_ids(const int *sendproc, const int *recvproc);
  bool print_info = false; // verbose debug printing (SEVENN_PRINT_INFO)
  int world_rank;          // MPI rank, cached for log prefixes
};
// Process-wide singleton owning grow-only device (GPU) staging buffers for
// the CUDA-aware-MPI communication path.
class DeviceBuffManager {
 private:
  DeviceBuffManager() {}
  // non-copyable: copy operations declared but intentionally not defined
  DeviceBuffManager(const DeviceBuffManager &);
  DeviceBuffManager &operator=(const DeviceBuffManager &);
  float *buf_send_device = nullptr; // device send staging buffer
  float *buf_recv_device = nullptr; // device receive staging buffer
  int send_buf_size = 0; // current send capacity, in floats
  int recv_buf_size = 0; // current receive capacity, in floats

 public:
  static DeviceBuffManager &getInstance();
  // Ensure buffers can hold the requested float counts; returns pointers.
  void get_buffer(int, int, float *&, float *&);
  ~DeviceBuffManager();
};
} // namespace LAMMPS_NS
#endif
#endif
#!/bin/bash
# Patch a LAMMPS source tree with SevenNet's e3gnn pair styles and, optionally,
# the CUDA-accelerated pair_d3.
# Requires bash (uses [[ ]] and BASH_SOURCE), so invoke with `bash`, not `sh`.
lammps_root=$1
cxx_standard=$2 # 14, 17
d3_support=$3   # 1, 0
SCRIPT_DIR=$(dirname "${BASH_SOURCE[0]}")

###########################################
#  Check if the given arguments are valid #
###########################################

# Check the number of arguments
if [ "$#" -ne 3 ]; then
    echo "Usage: bash patch_lammps.sh {lammps_root} {cxx_standard} {d3_support}"
    echo "  {lammps_root}: Root directory of LAMMPS source"
    echo "  {cxx_standard}: C++ standard (14, 17)"
    echo "  {d3_support}: Support for pair_d3 (1, 0)"
    exit 1
fi

# Check if the lammps_root directory exists
if [ ! -d "$lammps_root" ]; then
    echo "Error: No such directory: $lammps_root"
    exit 1
fi

# Check if the given directory is the root of LAMMPS source.
# A LAMMPS root contains BOTH cmake/ and potentials/, so fail when either is
# missing. (Bug fix: the original used '&&', which only failed when both were
# absent.)
if [ ! -d "$lammps_root/cmake" ] || [ ! -d "$lammps_root/potentials" ]; then
    echo "Error: Given $lammps_root is not a root of LAMMPS source"
    exit 1
fi

# Check if the script is being run from the root of SevenNet
if [ ! -f "${SCRIPT_DIR}/pair_e3gnn.cpp" ]; then
    echo "Error: Script executed in a wrong directory"
    exit 1
fi

# Check if the patch is already applied
if [ -f "$lammps_root/src/pair_e3gnn.cpp" ]; then
    echo "----------------------------------------------------------"
    echo "Seems like given LAMMPS is already patched."
    echo "Try again after removing src/pair_e3gnn.cpp to force patch"
    echo "----------------------------------------------------------"
    echo "Example build commands, under LAMMPS root"
    echo "  mkdir build; cd build"
    echo "  cmake ../cmake -DCMAKE_PREFIX_PATH=$(python -c 'import torch;print(torch.utils.cmake_prefix_path)')"
    echo "  make -j 4"
    exit 0
fi

# Check if OpenMPI exists and if it is CUDA-aware
if command -v ompi_info &> /dev/null; then
    cuda_support=$(ompi_info --parsable --all | grep mpi_built_with_cuda_support:value)
    if [[ -z "$cuda_support" ]]; then
        # ompi_info ran but did not report the flag, so we cannot tell.
        # (Bug fix: the original wrongly printed "OpenMPI not found" here.)
        echo "Could not determine if OpenMPI is CUDA aware, parallel performance may not be optimal"
    elif [[ "$cuda_support" == *"true" ]]; then
        echo "OpenMPI is CUDA aware"
    else
        echo "This system's OpenMPI is not 'CUDA aware', parallel performance is not optimal"
    fi
else
    echo "OpenMPI not found, parallel performance is not optimal"
fi

# Extract LAMMPS version (e.g. '2 Aug 2023') from src/version.h
lammps_version=$(grep "#define LAMMPS_VERSION" "$lammps_root/src/version.h" | awk '{print $3, $4, $5}' | tr -d '"')
detected_version="$lammps_version"
required_version="2 Aug 2023" # Example required version

# Warn (do not abort) when the detected version differs from the tested one
if [[ "$detected_version" != "$required_version" ]]; then
    echo "Warning: Detected LAMMPS version ($detected_version) may not be compatible. Required version: $required_version"
fi

###########################################
#  Backup original LAMMPS source code     #
###########################################
# Create a backup directory if it doesn't exist
backup_dir="$lammps_root/_backups"
mkdir -p "$backup_dir"

# Copy comm_* from original LAMMPS source as backup
cp "$lammps_root/src/comm_brick.cpp" "$backup_dir/"
cp "$lammps_root/src/comm_brick.h" "$backup_dir/"

# Copy cmake/CMakeLists.txt from original source as backup
cp "$lammps_root/cmake/CMakeLists.txt" "$backup_dir/CMakeLists.txt"

###########################################
#  Patch LAMMPS source code: e3gnn        #
###########################################
# 1. Copy pair_e3gnn files to LAMMPS source
cp "$SCRIPT_DIR"/{pair_e3gnn,pair_e3gnn_parallel,comm_brick}.cpp "$lammps_root/src/"
cp "$SCRIPT_DIR"/{pair_e3gnn,pair_e3gnn_parallel,comm_brick}.h "$lammps_root/src/"

# 2. Patch cmake/CMakeLists.txt: bump the C++ standard and link LibTorch
sed -i "s/set(CMAKE_CXX_STANDARD 11)/set(CMAKE_CXX_STANDARD $cxx_standard)/" "$lammps_root/cmake/CMakeLists.txt"
cat >> "$lammps_root/cmake/CMakeLists.txt" << "EOF"
find_package(Torch REQUIRED)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
target_link_libraries(lammps PUBLIC "${TORCH_LIBRARIES}")
EOF

###########################################
#  Patch LAMMPS source code: d3           #
###########################################
if [ "$d3_support" -ne 0 ]; then
    # 1. Copy pair_d3 files to LAMMPS source
    cp "$SCRIPT_DIR/pair_d3.cu" "$lammps_root/src/"
    cp "$SCRIPT_DIR/pair_d3.h" "$lammps_root/src/"
    cp "$SCRIPT_DIR/pair_d3_pars.h" "$lammps_root/src/"

    # 2. Patch cmake/CMakeLists.txt: enable the CUDA language and make the
    #    source glob also pick up .cu files
    sed -i "s/project(lammps CXX)/project(lammps CXX CUDA)/" "$lammps_root/cmake/CMakeLists.txt"
    sed -i "s/\${LAMMPS_SOURCE_DIR}\/\[\^.\]\*\.cpp/\${LAMMPS_SOURCE_DIR}\/\[\^.\]\*\.cpp \${LAMMPS_SOURCE_DIR}\/\[\^.\]\*\.cu/" "$lammps_root/cmake/CMakeLists.txt"
    cat >> "$lammps_root/cmake/CMakeLists.txt" << "EOF"
find_package(CUDA)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -fmad=false -O3")
string(REPLACE "-gencode arch=compute_50,code=sm_50" "" CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}")
target_link_libraries(lammps PUBLIC ${CUDA_LIBRARIES} cuda)
EOF
fi

###########################################
# Print changes and backup file locations #
###########################################
echo "Changes made:"
echo "  - Original LAMMPS files (src/comm_brick.*, cmake/CMakeLists.txt) are in {lammps_root}/_backups"
echo "  - Copied contents of pair_e3gnn to $lammps_root/src/"
echo "  - Patched CMakeLists.txt: include LibTorch, CXX_STANDARD $cxx_standard"
if [ "$d3_support" -ne 0 ]; then
    echo "  - Copied contents of pair_d3 to $lammps_root/src/"
    echo "  - Patched CMakeLists.txt: include CUDA"
fi

# Provide example cmake command to the user
echo "Example build commands, under LAMMPS root"
echo "  mkdir build; cd build"
echo "  cmake ../cmake -DCMAKE_PREFIX_PATH=$(python -c 'import torch;print(torch.utils.cmake_prefix_path)')"
echo "  make -j 4"
exit 0
model:
chemical_species: 'univ' # Ready for 119 elements
cutoff: 5.0
channel: 128
is_parity: False
lmax: 2
num_convolution_layer: 5
irreps_manual:
- "128x0e"
- "128x0e+64x1e+32x2e"
- "128x0e+64x1e+32x2e"
- "128x0e+64x1e+32x2e"
- "128x0e+64x1e+32x2e"
- "128x0e"
weight_nn_hidden_neurons: [64, 64]
radial_basis:
radial_basis_name: 'bessel'
bessel_basis_num: 8
cutoff_function:
cutoff_function_name: 'XPLOR'
cutoff_on: 4.5
act_gate: {'e': 'silu', 'o': 'tanh'}
act_scalar: {'e': 'silu', 'o': 'tanh'}
conv_denominator: 'avg_num_neigh'
train_shift_scale: False
train_denominator: False
self_connection_type: 'linear'
# Following are used to specify which part of the model would utilize fidelity-dependent parameters for multi-fidelity training.
# For detailed architecture, please refer to https://arxiv.org/abs/2409.07947
# Parts using fidelity-dependent weights are indicated as `Modified linear` layers in Figure 1.
use_modal_node_embedding: False # If true, use modified linear layer in atom-type embedding layer.
use_modal_self_inter_intro: True # If true, use modified linear layers in self-interaction block before the convolution in the interaction blocks.
use_modal_self_inter_outro: True # If true, use modified linear layers in self-interaction block after the convolution in the interaction blocks.
use_modal_output_block: True # If true, use modified linear layer in the output block.
train:
train_shuffle: True
random_seed: 777
is_train_stress : True
epoch: 200
loss: 'Huber'
loss_param:
delta: 0.01
optimizer: 'adam'
optim_param:
lr: 0.01
scheduler: 'linearlr'
scheduler_param:
start_factor: 1.0
total_iters: 200
end_factor: 0.0001
force_loss_weight : 1.00
stress_loss_weight: 0.01
error_record:
- ['Energy', 'MAE']
- ['Force', 'MAE']
- ['Stress', 'MAE']
- ['Energy', 'Loss']
- ['Force', 'Loss']
- ['Stress', 'Loss']
- ['TotalLoss', 'None']
per_epoch: 10
use_modality: True
use_weight: True
data:
batch_size: 64
shift: 'elemwise_reference_energies'
scale: 1.73
use_modal_wise_shift: True # If true, use different atomic energy shift for each database
use_modal_wise_scale: False # If true, use different atomic energy scale for each database
load_trainset_path:
- data_modality: pbe # Name of database
file_list:
- file: "**path to PBE database**" # ASE readable or .pt file (graph.pt)
data_weight:
energy: 1.0
force: 1.0 # This weight would be additionally multiplied to `force_loss_weight` for this database
stress: 1.0 # This weight would be additionally multiplied to `stress_loss_weight` for this database
- data_modality: r2scan
file_list:
- file: "**path to r2SCAN database**"
data_weight:
energy: 7.0
force: 7.0
stress: 7.0
load_pbe_validset_path: # any name starts with 'load' and ends with 'set_path'
- data_modality: pbe # modality must be given for mm valid set
file_list:
- file: "**path to PBE test set**"
load_scan_validset_path:
- data_modality: r2scan
file_list:
- file: "**path to r2SCAN test set**"
# Example input.yaml for training SevenNet.
# '*' signifies default. You can check log.sevenn for defaults.
model:
chemical_species: 'Auto' # Elements model should know. [ 'Univ' | 'Auto' | manual_user_input ]
cutoff: 5.0 # Cutoff radius in Angstroms. If two atoms are within the cutoff, they are connected.
channel: 32 # The multiplicity(channel) of node features.
lmax: 2 # Maximum order of irreducible representations (rotation order).
num_convolution_layer: 3 # The number of message passing layers.
#irreps_manual: # Manually set irreps of the model in each layer
#- "128x0e"
#- "128x0e+64x1e+32x2e"
#- "128x0e+64x1e+32x2e"
#- "128x0e+64x1e+32x2e"
#- "128x0e+64x1e+32x2e"
#- "128x0e"
weight_nn_hidden_neurons: [64, 64] # Hidden neurons in convolution weight neural network
radial_basis: # Function and its parameters to encode radial distance
radial_basis_name: 'bessel' # Only 'bessel' is currently supported
bessel_basis_num: 8
cutoff_function: # Envelop function, multiplied to radial_basis functions to init edge features
cutoff_function_name: 'poly_cut' # {'poly_cut' and 'poly_cut_p_value'} or {'XPLOR' and 'cutoff_on'}
poly_cut_p_value: 6
act_gate: {'e': 'silu', 'o': 'tanh'} # Equivalent to 'nonlinearity_gates' in nequip
act_scalar: {'e': 'silu', 'o': 'tanh'} # Equivalent to 'nonlinearity_scalars' in nequip
is_parity: False # Parity: True (E(3) group) or False (SE(3) group)
self_connection_type: 'nequip' # Default is 'nequip'. 'linear' is used for SevenNet-0. I recommend 'linear' for 'Univ' chemical_species
conv_denominator: "avg_num_neigh" # Valid options are "avg_num_neigh*", "sqrt_avg_num_neigh", or float
train_denominator: False # Enable training for denominator in convolution layer
train_shift_scale: False # Enable training for shift & scale in output layer
train:
random_seed: 1
is_train_stress: True # Includes stress in the loss function
epoch: 200 # Ends training after this number of epochs
#loss: 'Huber' # Default is 'mse' (mean squared error)
#loss_param:
#delta: 0.01
# Each optimizer and scheduler have different available parameters.
# You can refer to sevenn/train/optim.py for supporting optimizer & schedulers
optimizer: 'adam' # Options available are 'sgd', 'adagrad', 'adam', 'adamw', 'radam'
optim_param:
lr: 0.005
scheduler: 'exponentiallr' # 'steplr', 'multisteplr', 'exponentiallr', 'cosineannealinglr', 'reducelronplateau', 'linearlr'
scheduler_param:
gamma: 0.99
force_loss_weight: 0.1 # Coefficient for force loss
stress_loss_weight: 1e-06 # Coefficient for stress loss (to kbar unit)
per_epoch: 10 # Generate checkpoints every this epoch
# ['target y', 'metric']
# Target y: TotalEnergy, Energy, Force, Stress, Stress_GPa, TotalLoss
# Metric : RMSE, MAE, or Loss
error_record:
- ['Energy', 'RMSE']
- ['Force', 'RMSE']
- ['Stress', 'RMSE']
- ['TotalLoss', 'None']
# Continue training model from given checkpoint, or pre-trained model checkpoint for fine-tuning
#continue:
#checkpoint: 'checkpoint_best.pth' # Checkpoint of pre-trained model or a model want to continue training.
#reset_optimizer: False # Set True for fine-tuning
#reset_scheduler: False # Set True for fine-tuning
data:
batch_size: 4 # Per GPU batch size.
shift: 'per_atom_energy_mean' # One of 'per_atom_energy_mean*', 'elemwise_reference_energies', float
scale: 'force_rms' # One of 'force_rms*', 'per_atom_energy_std', float
# SevenNet automatically matches data format from its filename.
# For those not `structure_list` or `.pt` files, assumes it is ASE readable
# In this case, below arguments are directly passed to `ase.io.read`
data_format_args:
index: ':' # see `https://wiki.fysik.dtu.dk/ase/ase/io/io.html` for more valid arguments
# validset is needed if you want '_best.pth' during training. If not, both validset and testset is optional.
load_trainset_path: ['./train_*.extxyz'] # Example of using ase as data_format, support multiple files and expansion(*)
load_validset_path: ['./valid.extxyz']
load_testset_path: ['./sevenn_data/mydata.pt'] # Graph can be preprocessed using `sevenn_graph_build` and accessible like this
# Example input.yaml for fine-tuning sevennet-0
# '*' signifies default. You can check log.sevenn for defaults.
model: # model keys should be consistent except for train_* keys
chemical_species: 'Auto'
cutoff: 5.0
channel: 128
is_parity: False
lmax: 2
num_convolution_layer: 5
irreps_manual:
- "128x0e"
- "128x0e+64x1e+32x2e"
- "128x0e+64x1e+32x2e"
- "128x0e+64x1e+32x2e"
- "128x0e+64x1e+32x2e"
- "128x0e"
weight_nn_hidden_neurons: [64, 64]
radial_basis:
radial_basis_name: 'bessel'
bessel_basis_num: 8
cutoff_function:
cutoff_function_name: 'XPLOR'
cutoff_on: 4.5
self_connection_type: 'linear'
train_shift_scale: False # customizable (True | False)
train_denominator: False # customizable (True | False)
train: # Customizable
random_seed: 1
is_train_stress: True
epoch: 100
loss: 'Huber' # keeping original loss function give better ft result
loss_param:
delta: 0.01
optimizer: 'adam'
optim_param:
lr: 0.004
scheduler: 'exponentiallr'
scheduler_param:
gamma: 0.99
force_loss_weight: 1.0
stress_loss_weight: 0.01
per_epoch: 10 # Generate checkpoints every this epoch
# ['target y', 'metric']
# Target y: TotalEnergy, Energy, Force, Stress, Stress_GPa, TotalLoss
# Metric : RMSE, MAE, or Loss
error_record:
- ['Energy', 'RMSE']
- ['Force', 'RMSE']
- ['Stress', 'RMSE']
- ['TotalLoss', 'None']
continue:
reset_optimizer: True
reset_scheduler: True
reset_epoch: True
checkpoint: 'SevenNet-0_11July2024'
data: # Customizable
batch_size: 4
data_divide_ratio: 0.1
# SevenNet automatically matches data format from its filename.
# For those not `structure_list` or `.pt` files, assumes it is ASE readable
# In this case, below arguments are directly passed to `ase.io.read`
data_format_args:
index: ':' # see `https://wiki.fysik.dtu.dk/ase/ase/io/io.html` for more valid arguments
# validset is needed if you want '_best.pth' during training. If not, both validset and testset is optional.
load_trainset_path: ['./train_*.extxyz'] # Example of using ase as data_format, support multiple files and expansion(*)
load_validset_path: ['./valid.extxyz']
load_testset_path: ['./sevenn_data/mydata.pt'] # Graph can be preprocessed using `sevenn_graph_build` and accessible like this
# Application of 7net-0 on liquid electrolyte system via fine-tuning
# Paper: https://arxiv.org/abs/2501.05211
model:
# parameters of SevenNet-0, should not be changed
chemical_species: 'auto'
cutoff: 5.0
channel: 128
is_parity: False
lmax: 2
num_convolution_layer: 5
irreps_manual:
- "128x0e"
- "128x0e+64x1e+32x2e"
- "128x0e+64x1e+32x2e"
- "128x0e+64x1e+32x2e"
- "128x0e+64x1e+32x2e"
- "128x0e"
weight_nn_hidden_neurons: [64, 64]
radial_basis:
radial_basis_name: 'bessel'
bessel_basis_num: 8
cutoff_function:
cutoff_function_name: 'XPLOR'
cutoff_on: 4.5
act_gate: {'e': 'silu', 'o': 'tanh'}
act_scalar: {'e': 'silu', 'o': 'tanh'}
self_connection_type: 'linear'
# useful for fine-tuning
train_shift_scale: True
train_avg_num_neigh: True
train:
random_seed: 1
is_train_stress: True
epoch: 100 # we trained for 100 epochs and chose the checkpoint at epoch 50, where the error had reached a plateau.
loss: 'Huber'
loss_param:
delta: 0.01
optimizer: 'adam'
optim_param:
lr: 0.0001
scheduler: 'linearlr'
scheduler_param:
start_factor: 1.0
total_iters: 600
end_factor: 0.000001
force_loss_weight: 1.00
stress_loss_weight: 1.00 # 7net-0 quantitatively lacked accuracy on pressure histograms compared to DFT, so we increased stress loss weight
error_record:
- ['Energy', 'RMSE']
- ['Force', 'RMSE']
- ['Stress', 'RMSE']
- ['Energy', 'MAE']
- ['Force', 'MAE']
- ['Stress', 'MAE']
- ['Energy', 'Loss']
- ['Force', 'Loss']
- ['Stress', 'Loss']
- ['TotalLoss', 'None']
per_epoch: 10 # Generate a checkpoint every this number of epochs
continue:
use_statistic_values_of_checkpoint: True
checkpoint: '7net-0' # fine-tuning from 7net-0
reset_optimizer: True
reset_scheduler: True
data:
batch_size: 1 # our fine-tuning dataset had ~360 atoms per structure, so we used batch size of 1 to avoid GPU OOM error.
shift: 'elemwise_reference_energies'
scale: 1.858
data_format: 'ase'
data_divide_ratio: 0.05
load_dataset_path: ["./data/total.extxyz"]
model:
chemical_species: 'univ' # Ready for 119 elements
cutoff: 6.0
channel: 128
is_parity: False
lmax: 3
num_convolution_layer: 3
irreps_manual:
- "128x0e"
- "128x0e+64x1e+32x2e+16x3e"
- "128x0e+64x1e+32x2e+16x3e"
- "128x0e"
weight_nn_hidden_neurons: [64, 64]
radial_basis:
radial_basis_name: 'bessel'
bessel_basis_num: 8
cutoff_function:
cutoff_function_name: 'XPLOR'
cutoff_on: 5.5
act_gate: {'e': 'silu', 'o': 'tanh'}
act_scalar: {'e': 'silu', 'o': 'tanh'}
conv_denominator: 'avg_num_neigh'
train_shift_scale: True
train_denominator: False
self_connection_type: 'linear'
# Following are used to specify which part of the model would utilize fidelity-dependent parameters for multi-fidelity training.
# For detailed architecture, please refer to https://arxiv.org/abs/2409.07947
# Parts using fidelity-dependent weights are indicated as `Modified linear` layers in Figure 1.
use_modal_node_embedding: False # If true, use modified linear layer in atom-type embedding layer.
use_modal_self_inter_intro: True # If true, use modified linear layers in self-interaction block before the convolution in the interaction blocks.
use_modal_self_inter_outro: True # If true, use modified linear layers in self-interaction block after the convolution in the interaction blocks.
use_modal_output_block: True # If true, use modified linear layer in the output block.
train:
train_shuffle: True
random_seed: 777
is_train_stress : True
epoch: 200
loss: 'Huber'
loss_param:
delta: 0.01
optimizer: 'adam'
optim_param:
lr: 0.01
scheduler: 'linearlr'
scheduler_param:
start_factor: 1.0
total_iters: 200
end_factor: 0.0001
force_loss_weight : 1.00
stress_loss_weight: 0.01
error_record:
- ['Energy', 'MAE']
- ['Force', 'MAE']
- ['Stress', 'MAE']
- ['Energy', 'Loss']
- ['Force', 'Loss']
- ['Stress', 'Loss']
- ['TotalLoss', 'None']
per_epoch: 10
use_modality: True
use_weight: True
data:
batch_size: 16
shift: 'elemwise_reference_energies'
scale: 'force_rms'
use_modal_wise_shift: True # If true, use different atomic energy shift for each database
use_modal_wise_scale: False # If true, use different atomic energy scale for each database
load_trainset_path:
- data_modality: pbe # Name of database
file_list:
- file: "path to pbe dataset" # ASE readable or .pt file (graph.pt)
data_weight:
energy: 1.0
force: 0.1 # This weight would be additionally multiplied to `force_loss_weight` for this database
stress: 1.0 # This weight would be additionally multiplied to `stress_loss_weight` for this database
- data_modality: scan
file_list:
- file: "path to scan dataset"
data_weight:
energy: 1.0
force: 10.0
stress: 1.0
load_pbe_validset_path: # any name starts with 'load' and ends with 'set_path'
- data_modality: pbe # modality must be given for mm valid set
file_list:
- file: "path to pbe validset"
load_scan_validset_path:
- data_modality: scan
file_list:
- file: "path to scan validset"
# SevenNet-0, should be run with `sevenn -m train_v1` as it uses old routine
model:
chemical_species: 'auto'
cutoff: 5.0
channel: 128
is_parity: False
lmax: 2
num_convolution_layer: 5
irreps_manual:
- "128x0e"
- "128x0e+64x1e+32x2e"
- "128x0e+64x1e+32x2e"
- "128x0e+64x1e+32x2e"
- "128x0e+64x1e+32x2e"
- "128x0e"
weight_nn_hidden_neurons: [64, 64]
radial_basis:
radial_basis_name: 'bessel'
bessel_basis_num: 8
cutoff_function:
cutoff_function_name: 'XPLOR'
cutoff_on: 4.5
act_gate: {'e': 'silu', 'o': 'tanh'}
act_scalar: {'e': 'silu', 'o': 'tanh'}
conv_denominator: 'avg_num_neigh'
train_shift_scale: False
train_denominator: False
self_connection_type: 'linear'
train:
train_shuffle: False
random_seed: 1
is_train_stress : True
epoch: 600
loss: 'Huber'
loss_param:
delta: 0.01
optimizer: 'adam'
optim_param:
lr: 0.01
scheduler: 'linearlr'
scheduler_param:
start_factor: 1.0
total_iters: 600
end_factor: 0.0001
force_loss_weight : 1.00
stress_loss_weight: 0.01
error_record:
- ['Energy', 'RMSE']
- ['Force', 'RMSE']
- ['Stress', 'RMSE']
- ['Energy', 'MAE']
- ['Force', 'MAE']
- ['Stress', 'MAE']
- ['Energy', 'Loss']
- ['Force', 'Loss']
- ['Stress', 'Loss']
- ['TotalLoss', 'None']
per_epoch: 10
# continue:
# checkpoint: './checkpoint_last.pth'
# reset_optimizer: False
# reset_scheduler: False
data:
batch_size: 128 # per GPU batch size, as the model trained with 32 GPUs, the effective batch size equals 4096.
scale: 'per_atom_energy_std'
shift: 'elemwise_reference_energies'
data_format: 'ase'
save_by_train_valid: False
load_dataset_path: ["path_to_MPtrj_total.sevenn_data"]
load_validset_path: ["validaset.sevenn_data"]
model:
chemical_species: auto
cutoff: 5.0
irreps_manual:
- 128x0e
- 128x0e+64x1e+32x2e+32x3e
- 128x0e+64x1e+32x2e+32x3e
- 128x0e+64x1e+32x2e+32x3e
- 128x0e+64x1e+32x2e+32x3e
- 128x0e
channel: 128
lmax: 3
num_convolution_layer: 5
is_parity: false
radial_basis:
radial_basis_name: bessel
bessel_basis_num: 8
cutoff_function:
cutoff_function_name: poly_cut
poly_cut_p_value: 6
act_radial: silu
weight_nn_hidden_neurons:
- 64
- 64
act_scalar:
e: silu
o: tanh
act_gate:
e: silu
o: tanh
train_denominator: false
train_shift_scale: false
use_bias_in_linear: false
readout_as_fcn: false
self_connection_type: linear
interaction_type: nequip
train:
random_seed: 1
epoch: 600
loss: Huber
loss_param:
delta: 0.01
optimizer: adam
optim_param:
lr: 0.01
scheduler: linearlr
scheduler_param:
start_factor: 1.0
total_iters: 600
end_factor: 0.0001
force_loss_weight: 1.0
stress_loss_weight: 0.01
per_epoch: 10
is_train_stress: true
train_shuffle: true
error_record:
- - Energy
- MAE
- - Energy
- RMSE
- - Force
- MAE
- - Force
- RMSE
- - Stress
- MAE
- - Stress
- RMSE
- - Energy
- Loss
- - Force
- Loss
- - Stress
- Loss
- - TotalLoss
- None
best_metric: TotalLoss
data:
data_format: ase
data_format_args: {}
batch_size: 1024 # global batch size, should be divided by the number of GPUs
load_trainset_path: '**path_to_trainset**'
load_validset_path: '**path_to_validset**'
shift: 'elemwise_reference_energies'
scale: 'force_rms'
"""
Debt
keep old pre-trained checkpoints unchanged.
"""
import copy
import torch
import sevenn._keys as KEY
def version_tuple(v1):
    """Parse a dotted version string such as '0.10.2' into a tuple of ints."""
    return tuple(int(part) for part in v1.split('.'))
def patch_old_config(config):
    """Upgrade, in place, a config dict loaded from an old checkpoint.

    For <= 0.9.x configs: drops 'poly_cut_p_value' when the XPLOR cutoff is
    used, renames 'train_avg_num_neigh' to the current denominator key, and
    rejects the long-removed ``optimize_by_reduce: False`` option.
    Also backfills defaults for keys newer checkpoints always carry.

    Raises:
        ValueError: when the config has no 'version', or when it used
            ``optimize_by_reduce: False``.
    """
    version = config.get('version', None)
    if not version:
        raise ValueError('No version found in config')
    major_str, minor_str, _ = version.split('.')[:3]
    major, minor = int(major_str), int(minor_str)

    if major == 0 and minor <= 9:
        cutoff_fn = config[KEY.CUTOFF_FUNCTION]
        if cutoff_fn[KEY.CUTOFF_FUNCTION_NAME] == 'XPLOR':
            cutoff_fn.pop('poly_cut_p_value', None)
        if KEY.TRAIN_DENOMINTAOR not in config:
            config[KEY.TRAIN_DENOMINTAOR] = config.pop('train_avg_num_neigh', False)
        if config.pop('optimize_by_reduce', None) is False:
            raise ValueError(
                'This checkpoint(optimize_by_reduce: False) is no longer supported'
            )

    # Defaults for keys that old configs may lack entirely.
    if KEY.CONV_DENOMINATOR not in config:
        config[KEY.CONV_DENOMINATOR] = 0.0
    if KEY._NORMALIZE_SPH not in config:
        config[KEY._NORMALIZE_SPH] = False
    return config
def map_old_model(old_model_state_dict):
    """
    For compatibility with old namings (before 'correct' branch merged 2404XX).
    Returns a new state dict whose top-level module names follow the current
    convention; keys whose head is not in the rename table are kept verbatim.
    """
    renames = {
        'EdgeEmbedding': 'edge_embedding',
        'reducing nn input to hidden': 'reduce_input_to_hidden',
        'reducing nn hidden to energy': 'reduce_hidden_to_energy',
        'rescale atomic energy': 'rescale_atomic_energy',
    }
    for i in range(10):
        renames[f'{i} self connection intro'] = f'{i}_self_connection_intro'
        renames[f'{i} convolution'] = f'{i}_convolution'
        renames[f'{i} self interaction 2'] = f'{i}_self_interaction_2'
        renames[f'{i} equivariant gate'] = f'{i}_equivariant_gate'

    remapped = {}
    for key, value in old_model_state_dict.items():
        head, _, tail = key.partition('.')
        # Old checkpoints misspelled 'denominator' as 'denumerator'.
        tail = tail.replace('denumerator', 'denominator')
        if head in renames:
            remapped[f'{renames[head]}.{tail}'] = value
        else:
            # Unknown head: keep the original key untouched (including any
            # 'denumerator' spelling), matching the historical behavior.
            remapped[key] = value
    return remapped
def sort_old_convolution(model_now, state_dict):
    # Reorders each convolution's tensor-product weights in `state_dict` to
    # match the sorted instruction order of `model_now`, and repairs
    # sign-flipped Wigner-3j coefficients. Returns a new state dict.
    from e3nn.o3 import wigner_3j
    """
    Reason1: we have to sort instructions of convolution to be compatible with
    cuEquivariance. (therefore, sort weight)
    Reason2: some of old convolution module's w3j coeff has flipped sign. This also
    has to be fixed to be compatible with cuEquivarinace.
    """
    # NOTE(review): the triple-quoted text above is a bare string expression,
    # not a docstring, since it follows the import statement.

    def patch(stct):
        # Mutates `stct` (one convolution's slice of the state dict) in place.
        # Uses `conv`, `conv_key`, and `weight_nn` as closure variables bound
        # by the loop at the bottom of this function.
        inst_old = copy.copy(conv._instructions_before_sort)
        inst_old = [(inst[0], inst[1], inst[2]) for inst in inst_old]
        # Consumed once per conv; a second call on the same conv would fail.
        del conv._instructions_before_sort
        conv_args = conv.convolution_kwargs
        irreps_in1 = conv_args['irreps_in1']
        irreps_in2 = conv_args['irreps_in2']
        irreps_out = conv_args.get('irreps_out', conv_args.get('filter_irreps_out'))
        # Target order: instructions sorted by output irrep index.
        inst_sorted = sorted(inst_old, key=lambda x: x[2])
        inst_sorted = [
            # in1, in2, out, weights
            (inst[0], inst[1], inst[2], irreps_in1[inst[0]].mul)
            for inst in inst_sorted
        ]
        # Index of the weight NN layer holding the per-instruction weights
        # (second to last entry of `hs`) -- TODO confirm against weight_nn impl.
        n = len(weight_nn.hs) - 2
        ww_key = f'{conv_key}.weight_nn.layer{n}.weight'
        ww = stct[ww_key]
        ww_sorted = [None] * len(inst_old)
        _prev_idx = 0  # column offset of the current instruction's weight slab
        for ist_src in inst_old:
            for j, ist_dst in enumerate(inst_sorted):
                # Find where this (in1, in2, out) instruction landed after sort.
                if not all(ist_src[ii] == ist_dst[ii] for ii in range(3)):
                    continue
                numel = ist_dst[3]  # weight num
                ww_src = ww[:, _prev_idx : _prev_idx + numel]
                l1, l2, l3 = (
                    irreps_in1[ist_src[0]].ir.l,
                    irreps_in2[ist_src[1]].ir.l,
                    irreps_out[ist_src[2]].ir.l,
                )
                if l1 > 0 and l2 > 0 and l3 > 0:
                    # Old checkpoints may store the w3j coefficients with the
                    # opposite sign; flip both the stored coeff and the
                    # corresponding weights so the product is unchanged.
                    w3j_key = f'_w3j_{l1}_{l2}_{l3}'
                    conv_w3j_key = (
                        f'{conv_key}.convolution._compiled_main_left_right.{w3j_key}'
                    )
                    w3j_old = stct[conv_w3j_key]
                    w3j_now = wigner_3j(l1, l2, l3)
                    if not torch.allclose(w3j_old.to(w3j_now.device), w3j_now):
                        # Anything other than an exact sign flip is unexpected.
                        assert torch.allclose(
                            w3j_old.to(w3j_now.device), -1 * w3j_now
                        )
                        ww_src = -1 * ww_src
                        stct[conv_w3j_key] *= -1  # stct updated
                _prev_idx += numel
                ww_sorted[j] = ww_src
        ww_sorted = torch.cat(ww_sorted, dim=1)  # type: ignore
        stct[ww_key] = ww_sorted.clone()  # stct updated

    # Group state-dict entries by their convolution module ('{i}_convolution').
    conv_dicts = {}
    for k, v in state_dict.items():
        key_name = k.split('.')[0]
        # NOTE(review): this raises IndexError for top-level names without an
        # underscore; presumably all keys here already use new-style names
        # (run after map_old_model) -- TODO confirm.
        if key_name.split('_')[1] == 'convolution':
            if key_name not in conv_dicts:
                conv_dicts[key_name] = {}
            conv_dicts[key_name].update({k: v})
    new_state_dict = {}
    new_state_dict.update(state_dict)
    for conv_key, conv_state_dict in conv_dicts.items():
        conv = model_now._modules[conv_key]
        weight_nn = conv.weight_nn
        patch(conv_state_dict)
        new_state_dict.update(conv_state_dict)
    return new_state_dict
def patch_state_dict_if_old(state_dict, config_cp, now_model):
    """Apply backward-compatibility patches to `state_dict` based on the
    checkpoint version stored in `config_cp`.

    Raises:
        ValueError: when `config_cp` carries no 'version' entry.
    """
    version = config_cp.get('version', None)
    if not version:
        raise ValueError('No version found in config')

    parts = version.split('.')
    suffix = ''
    if len(parts) == 4:
        # e.g. '0.11.0.dev0': split off the dev suffix from the numeric part.
        suffix = parts[3]
        parts = parts[:3]
    numeric = version_tuple('.'.join(parts))

    # Checkpoints older than 0.10.0 still use the pre-rename module names.
    if numeric < version_tuple('0.10.0'):
        state_dict = map_old_model(state_dict)

    # TODO: change version criteria before release!!!
    # it causes problem if model is sorted but this function is called
    # ... more robust way? idk
    needs_conv_sort = numeric < version_tuple('0.11.0') or (
        numeric == version_tuple('0.11.0') and suffix == 'dev0'
    )
    if needs_conv_sort:
        state_dict = sort_old_convolution(now_model, state_dict)
    return state_dict
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment