/* Repository upload metadata (GitHub scrape artifact, kept as a comment so the
   file remains parseable):
   Unverified commit ca86f720, authored by zcxzcx1, committed via GitHub.
   Commit message: "Add files via upload". Parent commit: b75ed73c. */
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/, Sandia National Laboratories
LAMMPS development team: developers@lammps.org
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef PAIR_CLASS
PairStyle(d3, PairD3)
#else
#ifndef LMP_PAIR_D3
#define LMP_PAIR_D3
#include <cmath>
#include <string>
#include <vector>
#include <algorithm>
#include <map>
#include <unordered_map>
#include <cuda_runtime.h>
#include "pair.h"
#include "utils.h"
#include "atom.h"
#include "domain.h"
#include "error.h"
#include "comm.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "math_extra.h"
#include "pair_d3_pars.h"
// Removed dependencies to STL
// #include <stdlib.h> -> no more C style functions
// #define _USE_MATH_DEFINES -> no predefined constants
// Removed dependencies to LAMMPS
// #include "potential_file_reader.h" -> removed, PotentialFileReader
// #include "memory.h" -> already no dependency for CUDA version
namespace LAMMPS_NS {
/* CUDA-accelerated DFT-D3 dispersion-correction pair style.
   The host-side interface follows the LAMMPS Pair contract; most arrays are
   allocated with cudaMallocManaged (see allocate()) so the same pointers can
   be used from host code and from CUDA kernels. */
class PairD3 : public Pair {
public:
    PairD3(class LAMMPS*);
    ~PairD3() override;
    void compute(int, int) override;
    void settings(int, char**) override;
    void coeff(int, char**) override;
    double init_one(int i, int j) override;
    void init_style() override;
    void write_restart(FILE*) override;
    void read_restart(FILE*) override;
    void write_restart_settings(FILE*) override;
    void read_restart_settings(FILE*) override;
protected:
    virtual void allocate();
    /* ------- Read parameters ------- */
    int find_atomic_number(std::string&);                  // element symbol -> atomic number (1-based); -1 if unknown
    int is_int_in_array(int*, int, int);                   // linear search; returns index of value or -1
    void read_r0ab(int*, int);                             // fill r0ab cut-off radii for the selected elements
    void get_limit_in_pars_array(int&, int&, int&, int&);  // decode packed atom-number/grid indices of the C6 table
    void read_c6ab(int*, int);                             // fill c6ab reference values for the selected elements
    void setfuncpar_zero();                                // functional parameters, zero damping
    void setfuncpar_bj();                                  // functional parameters, Becke-Johnson damping
    void setfuncpar_zerom();                               // functional parameters, modified zero damping
    void setfuncpar_bjm();                                 // functional parameters, modified BJ damping
    void setfuncpar();                                     // dispatch on `damping`, then derive a1/a2/s8/alp6/alp8
    /* ------- Read parameters ------- */
    /* ------- Lattice information ------- */
    void set_lattice_repetition_criteria(float, int*);
    void set_lattice_vectors();
    /* ------- Lattice information ------- */
    /* ------- Initialize & Precalculate ------- */
    void load_atom_info();
    void precalculate_tau_array();
    /* ------- Initialize & Precalculate ------- */
    /* ------- Reallocate (when number of atoms changed) ------- */
    void reallocate_arrays();
    /* ------- Reallocate (when number of atoms changed) ------- */
    /* ------- Coordination number ------- */
    void get_coordination_number();
    void get_dC6_dCNij();
    /* ------- Coordination number ------- */
    /* ------- Main workers ------- */
    void get_forces_without_dC6_zero();
    void get_forces_without_dC6_bj();
    void get_forces_without_dC6_zerom();
    void get_forces_without_dC6_bjm();
    void get_forces_without_dC6();
    void get_forces_with_dC6();
    void update(int, int);
    /* ------- Main workers ------- */
    /*--------- Constants ---------*/
    static constexpr int MAX_ELEM = 94;             // maximum of the element number
    static constexpr int MAXC = 5;                  // maximum coordination number references per element
    static constexpr double AU_TO_ANG = 0.52917726; // conversion factors (atomic unit --> angstrom)
    static constexpr double AU_TO_EV = 27.21138505; // conversion factors (atomic unit --> eV)
    static constexpr float K1 = 16.0;               // global ad hoc parameters
    static constexpr float K3 = -4.0;               // global ad hoc parameters
    /*--------- Constants ---------*/
    /*--------- Parameters to read ---------*/
    int damping;                 // damping scheme index: 0=zero, 1=bj, 2=zerom, 3=bjm (see settings())
    std::string functional;      // exchange-correlation functional keyword
    float* r2r4 = nullptr;       // scale r4/r2 values of the atoms by sqrt(Z)
    float* rcov = nullptr;       // covalent radii
    int* mxc = nullptr;          // How large the grid for c6 interpolation
    float** r0ab = nullptr;      // cut-off radii for all element pairs
    float***** c6ab = nullptr;   // C6 for all element pairs
    float rthr;                  // R^2 distance to cutoff for C calculation
    float cnthr;                 // R^2 distance to cutoff for CN_calculation
    float s6, s8, s18, rs6, rs8, rs18, alp, alp6, alp8, a1, a2; // parameters for D3
    /*--------- Parameters to read ---------*/
    /*--------- Lattice related values ---------*/
    double* lat_v_1 = nullptr;   // lattice coordination vector
    double* lat_v_2 = nullptr;   // lattice coordination vector
    double* lat_v_3 = nullptr;   // lattice coordination vector
    int* rep_vdw = nullptr;      // repetition of cell for calculating D3
    int* rep_cn = nullptr;       // repetition of cell for calculating
    double** sigma = nullptr;    // virial pressure on cell
    /*--------- Lattice related values ---------*/
    /*--------- Per-atom values/arrays ---------*/
    double* cn = nullptr;        // Coordination numbers
    float** x = nullptr;         // Positions
    double** f = nullptr;        // Forces
    double* dc6i = nullptr;      // dC6i(iat) saves dE_dsp/dCN(iat)
    /*--------- Per-atom values/arrays ---------*/
    /*--------- Per-pair values/arrays ---------*/
    float* c6_ij_tot = nullptr;    // C6 per (i,j) pair, triangular linear index
    float* dc6_iji_tot = nullptr;  // dC6/dCN_i per pair
    float* dc6_ijj_tot = nullptr;  // dC6/dCN_j per pair
    /*--------- Per-pair values/arrays ---------*/
    /*---------- Global values ---------*/
    int n_save;        // to check whether the number of atoms has changed
    float disp_total;  // Dispersion energy
    /*---------- Global values ---------*/
    /*--------- For loop over tau (translation of cell) ---------*/
    float**** tau_vdw = nullptr;
    float**** tau_cn = nullptr;
    int* tau_idx_vdw = nullptr;
    int* tau_idx_cn = nullptr;
    int tau_idx_vdw_total_size;
    int tau_idx_cn_total_size;
    /*--------- For loop over tau (translation of cell) ---------*/
    /*--------- For cuda memory transfer (pointerized) ---------*/
    int *atomtype;     // element type per atom (device-visible)
    double *disp;      // accumulated dispersion energy (device-visible scalar)
    /*--------- For cuda memory transfer (pointerized) ---------*/
};
}
#endif // LMP_PAIR_D3
#endif // PAIR_CLASS
/*
This code is a skeleton of the LAMMPS pair_style d3 accelerated by CUDA.
All dependencies on LAMMPS have been removed.
The input and output variables are named based on the LAMMPS variables as much as possible.
*/
#include "pair_d3_for_ase.h"
/* --------- Macros for CUDA error handling --------- */
/* Timing helpers. START/STOP must be used as a pair, at most once per scope:
   START_CUDA_TIMER deliberately declares the events `start`/`stop` in the
   enclosing scope so STOP_CUDA_TIMER can reference them. */
#define START_CUDA_TIMER() \
    cudaEvent_t start, stop; \
    cudaEventCreate(&start); \
    cudaEventCreate(&stop); \
    cudaEventRecord(start);
/* Wrapped in do { } while (0) (consistent with the CHECK_* macros below) so
   the macro acts as a single statement after `if`/`else` and the local `msec`
   no longer leaks into the caller's scope. */
#define STOP_CUDA_TIMER(tag) \
    do { \
        cudaEventRecord(stop); \
        cudaEventSynchronize(stop); \
        float msec = 0; \
        cudaEventElapsedTime(&msec, start, stop); \
        printf("Elapsed time for %s: %f ms\n", tag, msec); \
        cudaEventDestroy(start); \
        cudaEventDestroy(stop); \
    } while (0);
/* Abort with file/line context if a CUDA runtime call returns an error. */
#define CHECK_CUDA(call) do { \
    cudaError_t status_ = call; \
    if (status_ != cudaSuccess) { \
        fprintf(stderr, "CUDA Error (%s:%d) -> %s: %s\n", __FILE__, __LINE__, \
            cudaGetErrorName(status_), cudaGetErrorString(status_)); \
        exit(EXIT_FAILURE); \
    } \
} while (0)
/* Synchronize, then surface any asynchronous kernel-execution error. */
#define CHECK_CUDA_ERROR() do { \
    cudaDeviceSynchronize(); \
    cudaError_t status_ = cudaGetLastError(); \
    if (status_ != cudaSuccess) { \
        fprintf(stderr, "CUDA Error (%s:%d) -> %s: %s\n", __FILE__, __LINE__, \
            cudaGetErrorName(status_), cudaGetErrorString(status_)); \
        exit(EXIT_FAILURE); \
    } \
} while (0)
/* Fail fast when no CUDA-capable device is visible. */
#define CHECK_CUDA_DEVICES() do { \
    int deviceCount = 0; \
    if (cudaGetDeviceCount(&deviceCount) != cudaSuccess || deviceCount == 0) { \
        fprintf(stderr, "CUDA Error (%s:%d) -> No CUDA devices found\n", \
            __FILE__, __LINE__); \
        exit(EXIT_FAILURE); \
    } \
} while(0)
/* --------- Macros for CUDA error handling --------- */
/* --------- Math functions for CUDA compatibility --------- */
/* Unroll the triangular pair loop: map a linear pair index `linij` back to
   (i, j) with 0 <= j <= i, where linij = i*(i+1)/2 + j.
   The inverse uses sqrt in double precision; for indices near a triangular
   boundary the truncation of the float result can be off by one, so the
   computed row is corrected until j lands in [0, i]. */
inline __host__ __device__ void ij_at_linij(int linij, int &i, int &j) {
    i = static_cast<int>((sqrt(8.0 * linij + 1.0) - 1.0) * 0.5);
    j = linij - i * (i + 1) / 2;
    if (j < 0) {                      // sqrt rounded up: step one row back
        --i;
        j = linij - i * (i + 1) / 2;
    } else if (j > i) {               // sqrt rounded down: step one row forward
        ++i;
        j = linij - i * (i + 1) / 2;
    }
}
/* Squared Euclidean length of a 3-vector (float). Mirrors MathExtra::lensq3. */
inline __host__ __device__ float lensq3(const float *v)
{
    const float vx = v[0];
    const float vy = v[1];
    const float vz = v[2];
    return vx * vx + vy * vy + vz * vz;
}
/* Cross product ans = v1 x v2 for 3-vectors (double). */
inline void cross3(const double *v1, const double *v2, double *ans)
{
    const double ax = v1[0], ay = v1[1], az = v1[2];
    const double bx = v2[0], by = v2[1], bz = v2[2];
    ans[0] = ay * bz - az * by;
    ans[1] = az * bx - ax * bz;
    ans[2] = ax * by - ay * bx;
}
/* Dot product of two 3-vectors (double). */
inline double dot3(const double *v1, const double *v2)
{
    double acc = 0.0;
    for (int k = 0; k < 3; ++k) {
        acc += v1[k] * v2[k];
    }
    return acc;
}
/* Euclidean length of a 3-vector (double). */
inline double len3(const double *v)
{
    const double sq = v[0] * v[0] + v[1] * v[1] + v[2] * v[2];
    return sqrt(sq);
}
/* --------- Math functions for CUDA compatibility --------- */
/* ----------------------------------------------------------------------
Constructor (Required)
------------------------------------------------------------------------- */
PairD3::PairD3() {
    // All arrays are allocated lazily in allocate(); nothing is owned yet.
    allocated = 0;
}
/* ----------------------------------------------------------------------
Destructor (Required)
------------------------------------------------------------------------- */
PairD3::~PairD3() {
    if (allocated) {
        int n = atom->natoms;
        int np1 = atom->ntypes + 1;
        // Extent of the tau translation grids: 2*rep+1 cells per direction.
        // NOTE(review): if the repetition counts were never set (still -1 from
        // allocate()), these extents are negative, the loops below are skipped,
        // and cudaFree on the still-null tau arrays is a no-op — safe either way.
        int vdw_range_x = 2 * rep_vdw[0] + 1;
        int vdw_range_y = 2 * rep_vdw[1] + 1;
        int vdw_range_z = 2 * rep_vdw[2] + 1;
        int cn_range_x = 2 * rep_cn[0] + 1;
        int cn_range_y = 2 * rep_cn[1] + 1;
        int cn_range_z = 2 * rep_cn[2] + 1;
        //for (int i = 0; i < np1; i++) { cudaFree(setflag[i]); }; cudaFree(setflag);
        //for (int i = 0; i < np1; i++) { cudaFree(cutsq[i]); }; cudaFree(cutsq);
        // Per-element parameter arrays
        cudaFree(r2r4);
        cudaFree(rcov);
        cudaFree(mxc);
        for (int i = 0; i < np1; i++) { cudaFree(r0ab[i]); }; cudaFree(r0ab);
        // 5-level nested c6ab: free leaves first, then each intermediate level
        for (int i = 0; i < np1; i++) {
            for (int j = 0; j < np1; j++) {
                for (int k = 0; k < MAXC; k++) {
                    for (int l = 0; l < MAXC; l++) {
                        cudaFree(c6ab[i][j][k][l]);
                    }
                    cudaFree(c6ab[i][j][k]);
                }
                cudaFree(c6ab[i][j]);
            }
            cudaFree(c6ab[i]);
        }
        cudaFree(c6ab);
        // Lattice vectors and cell repetition counts
        cudaFree(lat_v_1);
        cudaFree(lat_v_2);
        cudaFree(lat_v_3);
        cudaFree(rep_vdw);
        cudaFree(rep_cn);
        // Per-atom arrays
        cudaFree(cn);
        for (int i = 0; i < n; i++) { cudaFree(x[i]); }; cudaFree(x);
        cudaFree(dc6i);
        for (int i = 0; i < n; i++) { cudaFree(f[i]); }; cudaFree(f);
        for (int i = 0; i < 3; i++) { cudaFree(sigma[i]); }; cudaFree(sigma);
        // Per-pair arrays (triangular linear index)
        cudaFree(dc6_iji_tot);
        cudaFree(dc6_ijj_tot);
        cudaFree(c6_ij_tot);
        // tau translation grids (vdW and CN cutoffs)
        for (int i = 0; i < vdw_range_x; i++) {
            for (int j = 0; j < vdw_range_y; j++) {
                for (int k = 0; k < vdw_range_z; k++) {
                    cudaFree(tau_vdw[i][j][k]);
                }
                cudaFree(tau_vdw[i][j]);
            }
            cudaFree(tau_vdw[i]);
        }
        cudaFree(tau_vdw);
        for (int i = 0; i < cn_range_x; i++) {
            for (int j = 0; j < cn_range_y; j++) {
                for (int k = 0; k < cn_range_z; k++) {
                    cudaFree(tau_cn[i][j][k]);
                }
                cudaFree(tau_cn[i][j]);
            }
            cudaFree(tau_cn[i]);
        }
        cudaFree(tau_cn);
        cudaFree(tau_idx_vdw);
        cudaFree(tau_idx_cn);
        cudaFree(atomtype);
        cudaFree(disp);
    }
}
/* ----------------------------------------------------------------------
Allocate all arrays (Required)
------------------------------------------------------------------------- */
void PairD3::allocate() {
    CHECK_CUDA_DEVICES();
    allocated = 1;
    /* atom->ntypes : # of elements; element index starts from 1 */
    int n = atom->natoms;
    int np1 = atom->ntypes + 1;
    n_save = n;
    np1_save = np1;
    // NOTE(review): the cudaMallocManaged calls below are unchecked; a failed
    // allocation would only surface later as a fault. Consider wrapping them
    // with the CHECK_CUDA macro defined above.
    //cudaMallocManaged(&setflag, np1 * sizeof(int*)); for (int i = 0; i < np1; i++) { cudaMallocManaged(&setflag[i], np1 * sizeof(int)); }
    //cudaMallocManaged(&cutsq, np1 * sizeof(double*)); for (int i = 0; i < np1; i++) { cudaMallocManaged(&cutsq[i], np1 * sizeof(double)); }
    // Per-element parameter arrays (index 0 unused; types are 1-based)
    cudaMallocManaged(&r2r4, np1 * sizeof(float));
    cudaMallocManaged(&rcov, np1 * sizeof(float));
    cudaMallocManaged(&mxc, np1 * sizeof(int));
    cudaMallocManaged(&r0ab, np1 * sizeof(float*)); for (int i = 0; i < np1; i++) { cudaMallocManaged(&r0ab[i], np1 * sizeof(float)); }
    // c6ab[type1][type2][grid_i][grid_j] -> {C6_ref, CN_ref_1, CN_ref_2}
    cudaMallocManaged(&c6ab, np1 * sizeof(float****));
    for (int i = 0; i < np1; i++) {
        cudaMallocManaged(&c6ab[i], np1 * sizeof(float***));
        for (int j = 0; j < np1; j++) {
            cudaMallocManaged(&c6ab[i][j], MAXC * sizeof(float**));
            for (int k = 0; k < MAXC; k++) {
                cudaMallocManaged(&c6ab[i][j][k], MAXC * sizeof(float*));
                for (int l = 0; l < MAXC; l++) {
                    cudaMallocManaged(&c6ab[i][j][k][l], 3 * sizeof(float));
                }
            }
        }
    }
    // Lattice vectors and cell repetition counts
    // NOTE(review): lat_v_* are declared double* in the header but allocated
    // with sizeof(float) here — 3*sizeof(float) undersizes a double[3]; confirm
    // the intended element type.
    cudaMallocManaged(&lat_v_1, 3 * sizeof(float));
    cudaMallocManaged(&lat_v_2, 3 * sizeof(float));
    cudaMallocManaged(&lat_v_3, 3 * sizeof(float));
    cudaMallocManaged(&rep_vdw, 3 * sizeof(int));
    cudaMallocManaged(&rep_cn, 3 * sizeof(int));
    cudaMallocManaged(&sigma, 3 * sizeof(double*)); for (int i = 0; i < 3; i++) { cudaMallocManaged(&sigma[i], 3 * sizeof(double)); }
    // Per-atom arrays
    cudaMallocManaged(&cn, n * sizeof(double));
    cudaMallocManaged(&x, n * sizeof(float*)); for (int i = 0; i < n; i++) { cudaMallocManaged(&x[i], 3 * sizeof(float)); }
    cudaMallocManaged(&dc6i, n * sizeof(double));
    cudaMallocManaged(&f, n * sizeof(double*)); for (int i = 0; i < n; i++) { cudaMallocManaged(&f[i], 3 * sizeof(double)); }
    // Initialization
    // Initialize for lattice -> set_lattice_vectors()
    // Sentinel values: tau grids and repetitions are not yet computed.
    tau_idx_vdw_total_size = -1;
    tau_idx_cn_total_size = -1;
    for (int i = 0; i < 3; i++) {
        rep_vdw[i] = -1;
        rep_cn[i] = -1;
    }
    //for (int i = 1; i < np1; i++) {
    //    for (int j = 1; j < np1; j++) {
    //        setflag[i][j] = 0;
    //    }
    //}
    // Mark every c6ab entry as "not set"; read_c6ab() fills the valid ones.
    for (int idx1 = 0; idx1 < np1; idx1++) {
        for (int idx2 = 0; idx2 < np1; idx2++) {
            for (int idx3 = 0; idx3 < MAXC; idx3++) {
                for (int idx4 = 0; idx4 < MAXC; idx4++) {
                    for (int idx5 = 0; idx5 < 3; idx5++) {
                        c6ab[idx1][idx2][idx3][idx4][idx5] = -1;
                    }
                }
            }
        }
    }
    // Per-pair arrays over the upper triangle (i <= j), linear index
    int n_ij_combination = n * (n + 1) / 2;
    cudaMallocManaged(&dc6_iji_tot, n_ij_combination * sizeof(float));
    cudaMallocManaged(&dc6_ijj_tot, n_ij_combination * sizeof(float));
    cudaMallocManaged(&c6_ij_tot, n_ij_combination * sizeof(float));
    cudaMallocManaged(&atomtype, n * sizeof(int));
    cudaMallocManaged(&disp, sizeof(double));
}
/* ----------------------------------------------------------------------
Settings : read from pair_style (Required) -> pair_style d3 vdw_sq cn_sq damp_name func_name
------------------------------------------------------------------------- */
/* Parse pair_style settings: vdW cutoff^2, CN cutoff^2, damping scheme and
   functional name; then resolve the functional parameters via setfuncpar(). */
void PairD3::settings(double vdw_sq, double cn_sq, std::string damp_name, std::string func_name) {
    rthr = vdw_sq;    // R^2 cutoff for the dispersion (C6/C8) part
    cnthr = cn_sq;    // R^2 cutoff for the coordination-number part
    // Damping keyword -> internal index consumed by setfuncpar().
    static const std::map<std::string, int> dampingMap = {
        {"damp_zero", 0}, {"damp_bj", 1}, {"damp_zerom", 2}, {"damp_bjm", 3},
    };
    const auto it = dampingMap.find(damp_name);
    if (it == dampingMap.end()) {
        error->all(FLERR, "Unknown damping function");
    }
    damping = it->second;   // single lookup instead of find + operator[]
    functional = func_name;
    setfuncpar();
}
/* ----------------------------------------------------------------------
finds atomic number (used in PairD3::coeff)
------------------------------------------------------------------------- */
/* Map an element symbol to its atomic number (1-based, H..Pu).
   The key is normalized in place: lowercased and padded/truncated to exactly
   two characters ("h " style), matching the table encoding below.
   Returns -1 when the symbol is not in the table. */
int PairD3::find_atomic_number(std::string& key) {
    std::transform(key.begin(), key.end(), key.begin(), ::tolower);
    if (key.length() == 1) { key += " "; }
    key.resize(2);
    // `static` so the 94-entry table is built once instead of constructing a
    // vector of strings on every call.
    static const char* const element_table[] = {
        "h ","he",
        "li","be","b ","c ","n ","o ","f ","ne",
        "na","mg","al","si","p ","s ","cl","ar",
        "k ","ca","sc","ti","v ","cr","mn","fe","co","ni","cu",
        "zn","ga","ge","as","se","br","kr",
        "rb","sr","y ","zr","nb","mo","tc","ru","rh","pd","ag",
        "cd","in","sn","sb","te","i ","xe",
        "cs","ba","la","ce","pr","nd","pm","sm","eu","gd","tb","dy",
        "ho","er","tm","yb","lu","hf","ta","w ","re","os","ir","pt",
        "au","hg","tl","pb","bi","po","at","rn",
        "fr","ra","ac","th","pa","u ","np","pu"
    };
    constexpr int n_elements = sizeof(element_table) / sizeof(element_table[0]);
    for (int i = 0; i < n_elements; ++i) {
        if (key == element_table[i]) {
            return i + 1;   // atomic numbers are 1-based
        }
    }
    // symbol not found
    return -1;
}
/* ----------------------------------------------------------------------
Check whether an integer value in an integer array (used in PairD3::coeff)
------------------------------------------------------------------------- */
/* Linear search: return the index of `value` in `arr` (length `size`),
   or -1 when it is absent. */
int PairD3::is_int_in_array(int arr[], int size, int value) {
    int pos = 0;
    while (pos < size) {
        if (arr[pos] == value) {
            return pos;
        }
        ++pos;
    }
    return -1;
}
/* ----------------------------------------------------------------------
Read r0ab values from the table (used in PairD3::coeff)
------------------------------------------------------------------------- */
/* Fill r0ab (pairwise cut-off radii, atomic units) for the selected elements.
   `atomic_numbers` is 0-based over types; r0ab is indexed 1-based by type. */
void PairD3::read_r0ab(int* atomic_numbers, int ntypes) {
    // `static` keeps this ~70 KB table out of the stack frame and initializes
    // it once instead of on every call.
    static const double r0ab_table[94][94] = R0AB_TABLE;
    for (int i = 1; i <= ntypes; i++) {
        for (int j = 1; j <= ntypes; j++) {
            // table is 0-based by atomic number; convert Angstrom -> a.u.
            r0ab[i][j] = r0ab_table[atomic_numbers[i-1]-1][atomic_numbers[j-1]-1] / AU_TO_ANG;
        }
    }
}
/* ----------------------------------------------------------------------
Get atom pair indices and grid indices (used in PairD3::read_c6ab)
------------------------------------------------------------------------- */
/* Decode the packed atom-number columns of the C6 table.
   Each packed value encodes atom number (1..100) plus a CN-grid offset in
   steps of 100: grid index = (packed-1)/100 + 1, atom = (packed-1)%100 + 1.
   The atom numbers are rewritten in place; the grid indices are returned
   through idx_i / idx_j. */
void PairD3::get_limit_in_pars_array(int& idx_atom_1, int& idx_atom_2, int& idx_i, int& idx_j) {
    constexpr int shift = 100;
    const int packed_1 = idx_atom_1 - 1;
    const int packed_2 = idx_atom_2 - 1;
    idx_i = packed_1 / shift + 1;
    idx_j = packed_2 / shift + 1;
    idx_atom_1 = packed_1 % shift + 1;
    idx_atom_2 = packed_2 % shift + 1;
}
/* ----------------------------------------------------------------------
Read c6ab values from the table (used in PairD3::coeff)
------------------------------------------------------------------------- */
/* Fill c6ab (C6 reference values vs. coordination number) for the selected
   elements, and record in mxc how many CN grid points each type uses. */
void PairD3::read_c6ab(int* atomic_numbers, int ntypes) {
    for (int i = 1; i <= ntypes; i++) { mxc[i] = 0; }
    int grid_i = 0, grid_j = 0;
    // `static` keeps this ~1.27 MB table out of the stack frame (a plain local
    // of this size risks stack overflow) and initializes it only once.
    static const double c6ab_table[32385][5] = C6AB_TABLE;
    for (int i = 0; i < 32385; i++) {
        const double ref_c6 = c6ab_table[i][0];
        // columns 1/2 pack atom number + CN-grid index; decode them
        int atom_number_1 = static_cast<int>(c6ab_table[i][1]);
        int atom_number_2 = static_cast<int>(c6ab_table[i][2]);
        get_limit_in_pars_array(atom_number_1, atom_number_2, grid_i, grid_j);
        // skip rows for elements that are not present in this system
        const int idx_atom_1 = is_int_in_array(atomic_numbers, ntypes, atom_number_1);
        if (idx_atom_1 < 0) { continue; }
        const int idx_atom_2 = is_int_in_array(atomic_numbers, ntypes, atom_number_2);
        if (idx_atom_2 < 0) { continue; }
        const double ref_cn1 = c6ab_table[i][3];
        const double ref_cn2 = c6ab_table[i][4];
        mxc[idx_atom_1 + 1] = std::max(mxc[idx_atom_1 + 1], grid_i);
        mxc[idx_atom_2 + 1] = std::max(mxc[idx_atom_2 + 1], grid_j);
        // store both orderings; the CN references swap with the atom order
        c6ab[idx_atom_1 + 1][idx_atom_2 + 1][grid_i - 1][grid_j - 1][0] = ref_c6;
        c6ab[idx_atom_1 + 1][idx_atom_2 + 1][grid_i - 1][grid_j - 1][1] = ref_cn1;
        c6ab[idx_atom_1 + 1][idx_atom_2 + 1][grid_i - 1][grid_j - 1][2] = ref_cn2;
        c6ab[idx_atom_2 + 1][idx_atom_1 + 1][grid_j - 1][grid_i - 1][0] = ref_c6;
        c6ab[idx_atom_2 + 1][idx_atom_1 + 1][grid_j - 1][grid_i - 1][1] = ref_cn2;
        c6ab[idx_atom_2 + 1][idx_atom_1 + 1][grid_j - 1][grid_i - 1][2] = ref_cn1;
    }
}
/* ----------------------------------------------------------------------
Set functional parameters (used in PairD3::coeff)
------------------------------------------------------------------------- */
/* Zero-damping DFT-D3 parameters per functional (def2-QZVP reference data).
   Sets rs6, s18 and sometimes rs18/s6; setfuncpar() later maps these onto
   the working parameters (a1/a2/s8/alp6/alp8).
   NOTE: an unknown functional maps to commandCode 0 (operator[] inserts 0),
   which falls through to the default error branch. */
void PairD3::setfuncpar_zero() {
    s6 = 1.0;
    alp = 14.0;
    rs18 = 1.0;
    // default def2-QZVP (almost basis set limit)
    std::unordered_map<std::string, int> commandMap = {
        { "slater-dirac-exchange", 1}, { "b-lyp", 2 }, { "b-p", 3 }, { "b97-d", 4 }, { "revpbe", 5 },
        { "pbe", 6 }, { "pbesol", 7 }, { "rpw86-pbe", 8 }, { "rpbe", 9 }, { "tpss", 10 },
        { "b3-lyp", 11 }, { "pbe0", 12 }, { "hse06", 13 }, { "revpbe38", 14 }, { "pw6b95", 15 },
        { "tpss0", 16 }, { "b2-plyp", 17 }, { "pwpb95", 18 }, { "b2gp-plyp", 19 }, { "ptpss", 20 },
        { "hf", 21 }, { "mpwlyp", 22 }, { "bpbe", 23 }, { "bh-lyp", 24 }, { "tpssh", 25 },
        { "pwb6k", 26 }, { "b1b95", 27 }, { "bop", 28 }, { "o-lyp", 29 }, { "o-pbe", 30 },
        { "ssb", 31 }, { "revssb", 32 }, { "otpss", 33 }, { "b3pw91", 34 }, { "revpbe0", 35 },
        { "pbe38", 36 }, { "mpw1b95", 37 }, { "mpwb1k", 38 }, { "bmk", 39 }, { "cam-b3lyp", 40 },
        { "lc-wpbe", 41 }, { "m05", 42 }, { "m052x", 43 }, { "m06l", 44 }, { "m06", 45 },
        { "m062x", 46 }, { "m06hf", 47 }, { "hcth120", 48 }
    };
    int commandCode = commandMap[functional];
    switch (commandCode) {
    case 1: rs6 = 0.999; s18 = -1.957; rs18 = 0.697; break;
    case 2: rs6 = 1.094; s18 = 1.682; break;
    case 3: rs6 = 1.139; s18 = 1.683; break;
    case 4: rs6 = 0.892; s18 = 0.909; break;
    case 5: rs6 = 0.923; s18 = 1.010; break;
    case 6: rs6 = 1.217; s18 = 0.722; break;
    case 7: rs6 = 1.345; s18 = 0.612; break;
    case 8: rs6 = 1.224; s18 = 0.901; break;
    case 9: rs6 = 0.872; s18 = 0.514; break;
    case 10: rs6 = 1.166; s18 = 1.105; break;
    case 11: rs6 = 1.261; s18 = 1.703; break;
    case 12: rs6 = 1.287; s18 = 0.928; break;
    case 13: rs6 = 1.129; s18 = 0.109; break;
    case 14: rs6 = 1.021; s18 = 0.862; break;
    case 15: rs6 = 1.532; s18 = 0.862; break;
    case 16: rs6 = 1.252; s18 = 1.242; break;
    // double-hybrids also scale s6 below 1
    case 17: rs6 = 1.427; s18 = 1.022; s6 = 0.64; break;
    case 18: rs6 = 1.557; s18 = 0.705; s6 = 0.82; break;
    case 19: rs6 = 1.586; s18 = 0.760; s6 = 0.56; break;
    case 20: rs6 = 1.541; s18 = 0.879; s6 = 0.75; break;
    case 21: rs6 = 1.158; s18 = 1.746; break;
    case 22: rs6 = 1.239; s18 = 1.098; break;
    case 23: rs6 = 1.087; s18 = 2.033; break;
    case 24: rs6 = 1.370; s18 = 1.442; break;
    case 25: rs6 = 1.223; s18 = 1.219; break;
    case 26: rs6 = 1.660; s18 = 0.550; break;
    case 27: rs6 = 1.613; s18 = 1.868; break;
    case 28: rs6 = 0.929; s18 = 1.975; break;
    case 29: rs6 = 0.806; s18 = 1.764; break;
    case 30: rs6 = 0.837; s18 = 2.055; break;
    case 31: rs6 = 1.215; s18 = 0.663; break;
    case 32: rs6 = 1.221; s18 = 0.560; break;
    case 33: rs6 = 1.128; s18 = 1.494; break;
    case 34: rs6 = 1.176; s18 = 1.775; break;
    case 35: rs6 = 0.949; s18 = 0.792; break;
    case 36: rs6 = 1.333; s18 = 0.998; break;
    case 37: rs6 = 1.605; s18 = 1.118; break;
    case 38: rs6 = 1.671; s18 = 1.061; break;
    case 39: rs6 = 1.931; s18 = 2.168; break;
    case 40: rs6 = 1.378; s18 = 1.217; break;
    case 41: rs6 = 1.355; s18 = 1.279; break;
    case 42: rs6 = 1.373; s18 = 0.595; break;
    // Minnesota functionals include dispersion; s18 = 0 keeps only the C6 part
    case 43: rs6 = 1.417; s18 = 0.000; break;
    case 44: rs6 = 1.581; s18 = 0.000; break;
    case 45: rs6 = 1.325; s18 = 0.000; break;
    case 46: rs6 = 1.619; s18 = 0.000; break;
    case 47: rs6 = 1.446; s18 = 0.000; break;
    /* DFTB3(zeta = 4.0), old deprecated parameters; case ("dftb3"); rs6 = 1.235; s18 = 0.673; */
    case 48: rs6 = 1.221; s18 = 1.206; break;
    default:
        error->all(FLERR, "Functional name unknown");
        break;
    }
}
/* Becke-Johnson damping DFT-D3 parameters per functional.
   Sets rs6 (a1), s18 (s8), rs18 (a2) and occasionally s6; setfuncpar() later
   maps these onto the working parameters.
   Bug fix: in the original, case 16 (b2-plyp) placed `s6 = 0.64;` AFTER the
   `break;`, making it dead code — the reference DFT-D3(BJ) parameter set uses
   s6 = 0.64 for b2-plyp (as cases 17/18 already do for other double-hybrids). */
void PairD3::setfuncpar_bj() {
    s6 = 1.0;
    alp = 14.0;
    // `static const` builds the map once; find() avoids operator[]'s
    // insertion on unknown keys (code 0 still reaches the default error).
    static const std::unordered_map<std::string, int> commandMap = {
        {"b-p", 1}, {"b-lyp", 2}, {"revpbe", 3}, {"rpbe", 4}, {"b97-d", 5}, {"pbe", 6},
        {"rpw86-pbe", 7}, {"b3-lyp", 8}, {"tpss", 9}, {"hf", 10}, {"tpss0", 11}, {"pbe0", 12},
        {"hse06", 13}, {"revpbe38", 14}, {"pw6b95", 15}, {"b2-plyp", 16}, {"dsd-blyp", 17},
        {"dsd-blyp-fc", 18}, {"bop", 19}, {"mpwlyp", 20}, {"o-lyp", 21}, {"pbesol", 22}, {"bpbe", 23},
        {"opbe", 24}, {"ssb", 25}, {"revssb", 26}, {"otpss", 27}, {"b3pw91", 28}, {"bh-lyp", 29},
        {"revpbe0", 30}, {"tpssh", 31}, {"mpw1b95", 32}, {"pwb6k", 33}, {"b1b95", 34}, {"bmk", 35},
        {"cam-b3lyp", 36}, {"lc-wpbe", 37}, {"b2gp-plyp", 38}, {"ptpss", 39}, {"pwpb95", 40},
        {"hf/mixed", 41}, {"hf/sv", 42}, {"hf/minis", 43}, {"b3-lyp/6-31gd", 44}, {"hcth120", 45},
        {"pw1pw", 46}, {"pwgga", 47}, {"hsesol", 48}, {"hf3c", 49}, {"hf3cv", 50}, {"pbeh3c", 51},
        {"pbeh-3c", 52}, {"wb97m", 53}
    };
    const auto found = commandMap.find(functional);
    const int commandCode = (found == commandMap.end()) ? 0 : found->second;
    switch (commandCode) {
    case 1: rs6 = 0.3946; s18 = 3.2822; rs18 = 4.8516; break;
    case 2: rs6 = 0.4298; s18 = 2.6996; rs18 = 4.2359; break;
    case 3: rs6 = 0.5238; s18 = 2.3550; rs18 = 3.5016; break;
    case 4: rs6 = 0.1820; s18 = 0.8318; rs18 = 4.0094; break;
    case 5: rs6 = 0.5545; s18 = 2.2609; rs18 = 3.2297; break;
    case 6: rs6 = 0.4289; s18 = 0.7875; rs18 = 4.4407; break;
    case 7: rs6 = 0.4613; s18 = 1.3845; rs18 = 4.5062; break;
    case 8: rs6 = 0.3981; s18 = 1.9889; rs18 = 4.4211; break;
    case 9: rs6 = 0.4535; s18 = 1.9435; rs18 = 4.4752; break;
    case 10: rs6 = 0.3385; s18 = 0.9171; rs18 = 2.8830; break;
    case 11: rs6 = 0.3768; s18 = 1.2576; rs18 = 4.5865; break;
    case 12: rs6 = 0.4145; s18 = 1.2177; rs18 = 4.8593; break;
    case 13: rs6 = 0.383; s18 = 2.310; rs18 = 5.685; break;
    case 14: rs6 = 0.4309; s18 = 1.4760; rs18 = 3.9446; break;
    case 15: rs6 = 0.2076; s18 = 0.7257; rs18 = 6.3750; break;
    // bug fix: s6 assignment was dead code after `break` in the original
    case 16: rs6 = 0.3065; s18 = 0.9147; rs18 = 5.0570; s6 = 0.64; break;
    case 17: rs6 = 0.0000; s18 = 0.2130; rs18 = 6.0519; s6 = 0.50; break;
    case 18: rs6 = 0.0009; s18 = 0.2112; rs18 = 5.9807; s6 = 0.50; break;
    case 19: rs6 = 0.4870; s18 = 3.2950; rs18 = 3.5043; break;
    case 20: rs6 = 0.4831; s18 = 2.0077; rs18 = 4.5323; break;
    case 21: rs6 = 0.5299; s18 = 2.6205; rs18 = 2.8065; break;
    case 22: rs6 = 0.4466; s18 = 2.9491; rs18 = 6.1742; break;
    case 23: rs6 = 0.4567; s18 = 4.0728; rs18 = 4.3908; break;
    case 24: rs6 = 0.5512; s18 = 3.3816; rs18 = 2.9444; break;
    case 25: rs6 = -0.0952; s18 = -0.1744; rs18 = 5.2170; break;
    case 26: rs6 = 0.4720; s18 = 0.4389; rs18 = 4.0986; break;
    case 27: rs6 = 0.4634; s18 = 2.7495; rs18 = 4.3153; break;
    case 28: rs6 = 0.4312; s18 = 2.8524; rs18 = 4.4693; break;
    case 29: rs6 = 0.2793; s18 = 1.0354; rs18 = 4.9615; break;
    case 30: rs6 = 0.4679; s18 = 1.7588; rs18 = 3.7619; break;
    case 31: rs6 = 0.4529; s18 = 2.2382; rs18 = 4.6550; break;
    case 32: rs6 = 0.1955; s18 = 1.0508; rs18 = 6.4177; break;
    case 33: rs6 = 0.1805; s18 = 0.9383; rs18 = 7.7627; break;
    case 34: rs6 = 0.2092; s18 = 1.4507; rs18 = 5.5545; break;
    case 35: rs6 = 0.1940; s18 = 2.0860; rs18 = 5.9197; break;
    case 36: rs6 = 0.3708; s18 = 2.0674; rs18 = 5.4743; break;
    case 37: rs6 = 0.3919; s18 = 1.8541; rs18 = 5.0897; break;
    case 38: rs6 = 0.0000; s18 = 0.2597; rs18 = 6.3332; s6 = 0.560; break;
    case 39: rs6 = 0.0000; s18 = 0.2804; rs18 = 6.5745; s6 = 0.750; break;
    case 40: rs6 = 0.0000; s18 = 0.2904; rs18 = 7.3141; s6 = 0.820; break;
    // special HF / DFT with eBSSE correction;
    case 41: rs6 = 0.5607; s18 = 3.9027; rs18 = 4.5622; break;
    case 42: rs6 = 0.4249; s18 = 2.1849; rs18 = 4.2783; break;
    case 43: rs6 = 0.1702; s18 = 0.9841; rs18 = 3.8506; break;
    case 44: rs6 = 0.5014; s18 = 4.0672; rs18 = 4.8409; break;
    case 45: rs6 = 0.3563; s18 = 1.0821; rs18 = 4.3359; break;
    /* DFTB3 old, deprecated parameters : ;
     * case ("dftb3"); rs6 = 0.7461; s18 = 3.209; rs18 = 4.1906;
     * special SCC - DFTB parametrization;
     * full third order DFTB, self consistent charges, hydrogen pair damping with; exponent 4.2;
     */
    case 46: rs6 = 0.3807; s18 = 2.3363; rs18 = 5.8844; break;
    case 47: rs6 = 0.2211; s18 = 2.6910; rs18 = 6.7278; break;
    case 48: rs6 = 0.4650; s18 = 2.9215; rs18 = 6.2003; break;
    // special HF - D3 - gCP - SRB / MINIX parametrization;
    case 49: rs6 = 0.4171; s18 = 0.8777; rs18 = 2.9149; break;
    // special HF - D3 - gCP - SRB2 / ECP - 2G parametrization;
    case 50: rs6 = 0.3063; s18 = 0.5022; rs18 = 3.9856; break;
    // special PBEh - D3 - gCP / def2 - mSVP parametrization;
    case 51: rs6 = 0.4860; s18 = 0.0000; rs18 = 4.5000; break;
    case 52: rs6 = 0.4860; s18 = 0.0000; rs18 = 4.5000; break;
    case 53: rs6 = 0.5660; s18 = 0.3908; rs18 = 3.1280; break;
    default:
        error->all(FLERR, "Functional name unknown");
        break;
    }
}
/* Modified zero-damping DFT-D3 parameters per functional.
   Sets rs6/s18/rs18 (and s6 for b2-plyp); errors out on unknown names. */
void PairD3::setfuncpar_zerom() {
    s6 = 1.0;
    alp = 14.0;
    struct ZeroMParams { const char* name; double p_rs6, p_s18, p_rs18, p_s6; };
    static const ZeroMParams table[] = {
        {"b2-plyp", 1.313134, 0.717543, 0.016035, 0.640000},
        {"b3-lyp",  1.338153, 1.532981, 0.013988, 1.0},
        {"b97-d",   1.151808, 1.020078, 0.035964, 1.0},
        {"b-lyp",   1.279637, 1.841686, 0.014370, 1.0},
        {"b-p",     1.233460, 1.945174, 0.000000, 1.0},
        {"pbe",     2.340218, 0.000000, 0.129434, 1.0},
        {"pbe0",    2.077949, 0.000081, 0.116755, 1.0},
        {"lc-wpbe", 1.366361, 1.280619, 0.003160, 1.0},
    };
    for (const auto& entry : table) {
        if (functional == entry.name) {
            rs6 = entry.p_rs6;
            s18 = entry.p_s18;
            rs18 = entry.p_rs18;
            s6 = entry.p_s6;
            return;
        }
    }
    error->all(FLERR, "Functional name unknown");
}
/* Modified Becke-Johnson damping DFT-D3 parameters per functional.
   Sets rs6/s18/rs18 (and s6 for b2-plyp); errors out on unknown names. */
void PairD3::setfuncpar_bjm() {
    s6 = 1.0;
    alp = 14.0;
    struct BJMParams { const char* name; double p_rs6, p_s18, p_rs18, p_s6; };
    static const BJMParams table[] = {
        {"b2-plyp", 0.486434, 0.672820, 3.656466, 0.640000},
        {"b3-lyp",  0.278672, 1.466677, 4.606311, 1.0},
        {"b97-d",   0.240184, 1.206988, 3.864426, 1.0},
        {"b-lyp",   0.448486, 1.875007, 3.610679, 1.0},
        {"b-p",     0.821850, 3.140281, 2.728151, 1.0},
        {"pbe",     0.012092, 0.358940, 5.938951, 1.0},
        {"pbe0",    0.007912, 0.528823, 6.162326, 1.0},
        {"lc-wpbe", 0.563761, 0.906564, 3.593680, 1.0},
    };
    for (const auto& entry : table) {
        if (functional == entry.name) {
            rs6 = entry.p_rs6;
            s18 = entry.p_s18;
            rs18 = entry.p_rs18;
            s6 = entry.p_s6;
            return;
        }
    }
    error->all(FLERR, "Functional name unknown");
}
/* Resolve the functional parameters for the selected damping scheme, then
   derive the working D3 parameters from the raw rs6/rs18/s18/alp values.
   `damping` is guaranteed to be 0..3 by settings(). */
void PairD3::setfuncpar() {
    switch (damping) {
    case 0: setfuncpar_zero(); break;
    case 1: setfuncpar_bj(); break;
    case 2: setfuncpar_zerom(); break;
    case 3: setfuncpar_bjm(); break;
    }
    rs8 = rs18;
    alp6 = alp;
    alp8 = alp + 2.0;
    // rs10 = rs18
    // alp10 = alp + 4.0;
    a1 = rs6;    // BJ damping a1
    a2 = rs8;    // BJ damping a2
    s8 = s18;    // eighth-order scaling
    // s6 is already defined
}
/* ----------------------------------------------------------------------
Coeff : read from pair_coeff (Required) -> pair_coeff * * element1 element2 ...
------------------------------------------------------------------------- */
/* Fill the per-element parameter arrays (r2r4, rcov, r0ab, c6ab) from the
   reference tables for the atomic numbers of the system's types.
   `atomic_numbers` is 0-based over types; the member arrays are 1-based. */
void PairD3::coeff(int* atomic_numbers) {
    if (!allocated) allocate();
    if (atom->ntypes + 1 != np1_save) { reallocate_arrays_np1(); }
    int ntypes = atom->ntypes;
    /*
    scale r4/r2 values of the atoms by sqrt(Z)
    sqrt is also globally close to optimum
    together with the factor 1/2 this yield reasonable
    c8 for he, ne and ar. for larger Z, C8 becomes too large
    which effectively mimics higher R^n terms neglected due
    to stability reasons
    r2r4 =sqrt(0.5*r2r4(i)*dfloat(i)**0.5 ) with i=elementnumber
    the large number of digits is just to keep the results consistent
    with older versions. They should not imply any higher accuracy than
    the old values
    */
    double r2r4_ref[94] = {
        2.00734898, 1.56637132, 5.01986934, 3.85379032, 3.64446594,
        3.10492822, 2.71175247, 2.59361680, 2.38825250, 2.21522516,
        6.58585536, 5.46295967, 5.65216669, 4.88284902, 4.29727576,
        4.04108902, 3.72932356, 3.44677275, 7.97762753, 7.07623947,
        6.60844053, 6.28791364, 6.07728703, 5.54643096, 5.80491167,
        5.58415602, 5.41374528, 5.28497229, 5.22592821, 5.09817141,
        6.12149689, 5.54083734, 5.06696878, 4.87005108, 4.59089647,
        4.31176304, 9.55461698, 8.67396077, 7.97210197, 7.43439917,
        6.58711862, 6.19536215, 6.01517290, 5.81623410, 5.65710424,
        5.52640661, 5.44263305, 5.58285373, 7.02081898, 6.46815523,
        5.98089120, 5.81686657, 5.53321815, 5.25477007, 11.02204549,
        10.15679528, 9.35167836, 9.06926079, 8.97241155, 8.90092807,
        8.85984840, 8.81736827, 8.79317710, 7.89969626, 8.80588454,
        8.42439218, 8.54289262, 8.47583370, 8.45090888, 8.47339339,
        7.83525634, 8.20702843, 7.70559063, 7.32755997, 7.03887381,
        6.68978720, 6.05450052, 5.88752022, 5.70661499, 5.78450695,
        7.79780729, 7.26443867, 6.78151984, 6.67883169, 6.39024318,
        6.09527958, 11.79156076, 11.10997644, 9.51377795, 8.67197068,
        8.77140725, 8.65402716, 8.53923501, 8.85024712
    }; // atomic <r^2>/<r^4> values
    /*
    covalent radii (taken from Pyykko and Atsumi, Chem. Eur. J. 15, 2009, 188-197)
    values for metals decreased by 10 %
    ! data rcov/
    ! . 0.32, 0.46, 1.20, 0.94, 0.77, 0.75, 0.71, 0.63, 0.64, 0.67
    ! ., 1.40, 1.25, 1.13, 1.04, 1.10, 1.02, 0.99, 0.96, 1.76, 1.54
    ! ., 1.33, 1.22, 1.21, 1.10, 1.07, 1.04, 1.00, 0.99, 1.01, 1.09
    ! ., 1.12, 1.09, 1.15, 1.10, 1.14, 1.17, 1.89, 1.67, 1.47, 1.39
    ! ., 1.32, 1.24, 1.15, 1.13, 1.13, 1.08, 1.15, 1.23, 1.28, 1.26
    ! ., 1.26, 1.23, 1.32, 1.31, 2.09, 1.76, 1.62, 1.47, 1.58, 1.57
    ! ., 1.56, 1.55, 1.51, 1.52, 1.51, 1.50, 1.49, 1.49, 1.48, 1.53
    ! ., 1.46, 1.37, 1.31, 1.23, 1.18, 1.16, 1.11, 1.12, 1.13, 1.32
    ! ., 1.30, 1.30, 1.36, 1.31, 1.38, 1.42, 2.01, 1.81, 1.67, 1.58
    ! ., 1.52, 1.53, 1.54, 1.55 /
    these new data are scaled with k2=4./3. and converted a_0 via
    autoang=0.52917726d0
    */
    double rcov_ref[94] = {
        0.80628308, 1.15903197, 3.02356173, 2.36845659, 1.94011865,
        1.88972601, 1.78894056, 1.58736983, 1.61256616, 1.68815527,
        3.52748848, 3.14954334, 2.84718717, 2.62041997, 2.77159820,
        2.57002732, 2.49443835, 2.41884923, 4.43455700, 3.88023730,
        3.35111422, 3.07395437, 3.04875805, 2.77159820, 2.69600923,
        2.62041997, 2.51963467, 2.49443835, 2.54483100, 2.74640188,
        2.82199085, 2.74640188, 2.89757982, 2.77159820, 2.87238349,
        2.94797246, 4.76210950, 4.20778980, 3.70386304, 3.50229216,
        3.32591790, 3.12434702, 2.89757982, 2.84718717, 2.84718717,
        2.72120556, 2.89757982, 3.09915070, 3.22513231, 3.17473967,
        3.17473967, 3.09915070, 3.32591790, 3.30072128, 5.26603625,
        4.43455700, 4.08180818, 3.70386304, 3.98102289, 3.95582657,
        3.93062995, 3.90543362, 3.80464833, 3.82984466, 3.80464833,
        3.77945201, 3.75425569, 3.75425569, 3.72905937, 3.85504098,
        3.67866672, 3.45189952, 3.30072128, 3.09915070, 2.97316878,
        2.92277614, 2.79679452, 2.82199085, 2.84718717, 3.32591790,
        3.27552496, 3.27552496, 3.42670319, 3.30072128, 3.47709584,
        3.57788113, 5.06446567, 4.56053862, 4.20778980, 3.98102289,
        3.82984466, 3.85504098, 3.88023730, 3.90543362
    }; // covalent radii
    // Copy the per-element values for the types actually present (1-based).
    for (int i = 0; i < ntypes; i++) {
        r2r4[i+1] = r2r4_ref[atomic_numbers[i]-1];
        rcov[i+1] = rcov_ref[atomic_numbers[i]-1];
    }
    // set r0ab
    read_r0ab(atomic_numbers, ntypes);
    // read c6ab
    read_c6ab(atomic_numbers, ntypes);
}
/* ----------------------------------------------------------------------
Get derivative of C6 w.r.t. CN (used in PairD3::compute)
C6 = C6(CN_A, CN_B) == W(CN_A, CN_B) / Z(CN_A, CN_B)
This gives below from chain rule:
d(C6)/dr = d(C6)/d(CN_A) * d(CN_A)/dr + d(C6)/d(CN_B) * d(CN_B)/dr
So we can pre-calculate the d(C6)/d(CN_A), d(C6)/d(CN_B) part.
d(C6)/d(CN_i) = (dW/d(CN_i) * Z - W * dZ/d(CN_i)) / (W * W)
W : "denominator"
Z : "numerator"
dW/d(CN_i) : "d_denominator_i"
dZ/d(CN_j) : "d_numerator_j"
Z = Sum( L_ij(CN_A, CN_B) * C6_ref(CN_A_i, CN_B_j) ) over i, j
W = Sum( L_ij(CN_A, CN_B) ) over i, j
And the resulting derivative term is saved into
"dc6_iji_tot", "dc6_ijj_tot" array,
where we can find the value of d(C6)/d(CN_i)
by knowing the index of "iat", and "jat". ("idx_linij")
Also, c6 values will also be saved into "c6_ij_tot" array.
Here, as we are only interested in *pairs* of atoms, assume "iat" >= "jat".
Then "idx_linij" = "jat + (iat + 1) * iat / 2" have the order below.
idx_linij | j = 0 j = 1 j = 2 j = 3 ...
---------------------------------------------
i = 0 | 0
i = 1 | 1 2
i = 2 | 3 4 5
i = 3 | 6 7 8 9
... | ... ... ... ... ...
------------------------------------------------------------------------- */
__global__ void kernel_get_dC6_dCNij(
    int maxij, float K3,
    double *cn, int *mxc, float *****c6ab, int *type,
    float *c6_ij_tot, float *dc6_iji_tot, float *dc6_ijj_tot
) {
    // One thread per unique atom pair (iat >= jat), addressed by the linear
    // pair index "iter" described in the comment block above.
    int iter = blockIdx.x * blockDim.x + threadIdx.x;
    if (iter < maxij) {
        int iat, jat;
        ij_at_linij(iter, iat, jat);
        const int atomtype_i = type[iat];
        const int atomtype_j = type[jat];
        const float cni = cn[iat];
        const int mxci = mxc[atomtype_i];
        const float cnj = cn[jat];
        const int mxcj = mxc[atomtype_j];
        // Hoist the first two levels of the 5-level reference-table lookup.
        float ***c6ab_ij = c6ab[atomtype_i][atomtype_j];
        // Reference C6 whose (CN_A, CN_B) lies closest to the actual
        // coordination numbers; used as fallback if all weights underflow.
        // Sentinel is -FLT_MAX written as a literal: the original "-1e99f"
        // exceeds the float range (ill-formed; typically collapses to -inf).
        float c6mem = -3.402823466e+38f;
        float r_save = 9999.0f;
        // Gaussian-weighted interpolation accumulators. Kept in double since
        // the exp() weights span many orders of magnitude.
        double numerator = 0.0;        // Z        = sum L_ab * C6_ref
        double denominator = 0.0;      // W        = sum L_ab
        double d_numerator_i = 0.0;    // dZ/d(CN_i)
        double d_denominator_i = 0.0;  // dW/d(CN_i)
        double d_numerator_j = 0.0;    // dZ/d(CN_j)
        double d_denominator_j = 0.0;  // dW/d(CN_j)
        for (int a = 0; a < mxci; a++) {
            for (int b = 0; b < mxcj; b++) {
                float c6ref = c6ab_ij[a][b][0];
                if (c6ref > 0.0f) {
                    float cn_refi = c6ab_ij[a][b][1];
                    float cn_refj = c6ab_ij[a][b][2];
                    // Squared distance in (CN_i, CN_j) space to this reference point.
                    float r = (cn_refi - cni) * (cn_refi - cni) + (cn_refj - cnj) * (cn_refj - cnj);
                    if (r < r_save) {
                        r_save = r;
                        c6mem = c6ref;
                    }
                    double expterm = exp(static_cast<double>(K3) * static_cast<double>(r)); // must be double
                    numerator += c6ref * expterm;
                    denominator += expterm;
                    // d/d(CN) of exp(K3*r): chain-rule factor 2*K3*(CN - CN_ref).
                    expterm *= 2.0f * K3;
                    double term = expterm * (cni - cn_refi);
                    d_numerator_i += c6ref * term;
                    d_denominator_i += term;
                    term = expterm * (cnj - cn_refj);
                    d_numerator_j += c6ref * term;
                    d_denominator_j += term;
                }
            }
        }
        if (denominator > 1e-99) {
            // C6 = Z / W;  dC6/d(CN_i) = (dZ_i - C6 * dW_i) / W  (same for j).
            const double denominator_rc = 1.0 / denominator; // must be double
            const double unit_frac = numerator * denominator_rc;
            c6_ij_tot[iter] = unit_frac;
            dc6_iji_tot[iter] = denominator_rc * fma(unit_frac, -d_denominator_i, d_numerator_i); // must be double
            dc6_ijj_tot[iter] = denominator_rc * fma(unit_frac, -d_denominator_j, d_numerator_j); // must be double
        }
        else {
            // All weights underflowed: fall back to the nearest reference C6
            // and report a zero derivative.
            c6_ij_tot[iter] = c6mem;
            dc6_iji_tot[iter] = 0.0f;
            dc6_ijj_tot[iter] = 0.0f;
        }
    }
}
void PairD3::get_dC6_dCNij() {
    // Launch one thread per unique (i, j) atom pair, i >= j.
    // NOTE(review): atom->natoms is narrowed to int here; assumes the atom
    // count (and n*(n+1)/2) fits in a signed 32-bit int -- confirm for large systems.
    const int natoms = atom->natoms;
    const int num_pairs = natoms * (natoms + 1) / 2;
    //START_CUDA_TIMER();
    const int block_size = 128;
    const int grid_size = (num_pairs + block_size - 1) / block_size;
    kernel_get_dC6_dCNij<<<grid_size, block_size>>>(
        num_pairs, K3,
        cn, mxc, c6ab, atomtype,
        c6_ij_tot, dc6_iji_tot, dc6_ijj_tot
    );
    // Block until the kernel finishes: the outputs live in managed memory
    // and are consumed on the host right after this call.
    cudaDeviceSynchronize();
    //STOP_CUDA_TIMER("get_dC6dCNij");
}
/* ----------------------------------------------------------------------
Get lattice vectors (used in PairD3::compute)
1) Save lattice vectors into "lat_v_1", "lat_v_2", "lat_v_3"
2) Calculate repetition criteria for vdw, cn
3) precalculate tau (xyz shift due to cell repetition)
------------------------------------------------------------------------- */
void PairD3::set_lattice_vectors() {
// Box bounds and tilt factors in Angstrom. LAMMPS triclinic boxes are
// upper triangular: a1 along x, a2 in the xy plane.
double boxxlo = domain->boxlo[0];
double boxxhi = domain->boxhi[0];
double boxylo = domain->boxlo[1];
double boxyhi = domain->boxhi[1];
double boxzlo = domain->boxlo[2];
double boxzhi = domain->boxhi[2];
double xy = domain->xy;
double xz = domain->xz;
double yz = domain->yz;
// Lattice vectors converted to atomic units (Bohr).
lat_v_1[0] = (boxxhi - boxxlo) / AU_TO_ANG;
lat_v_1[1] = 0.0;
lat_v_1[2] = 0.0;
lat_v_2[0] = xy / AU_TO_ANG;
lat_v_2[1] = (boxyhi - boxylo) / AU_TO_ANG;
lat_v_2[2] = 0.0;
lat_v_3[0] = xz / AU_TO_ANG;
lat_v_3[1] = yz / AU_TO_ANG;
lat_v_3[2] = (boxzhi - boxzlo) / AU_TO_ANG;
// Extents (2*rep+1 cells per axis) of the OLD repetition grids, recorded
// before rep_vdw / rep_cn are updated: needed to free the old tau arrays.
int vdwrx_save = 2 * rep_vdw[0] + 1;
int vdwry_save = 2 * rep_vdw[1] + 1;
int vdwrz_save = 2 * rep_vdw[2] + 1;
int cnrx_save = 2 * rep_cn[0] + 1;
int cnry_save = 2 * rep_cn[1] + 1;
int cnrz_save = 2 * rep_cn[2] + 1;
// Recompute how many periodic images the vdW (rthr) and coordination-number
// (cnthr) cutoffs require for the current cell.
set_lattice_repetition_criteria(rthr, rep_vdw);
set_lattice_repetition_criteria(cnthr, rep_cn);
int vdw_range_x = 2 * rep_vdw[0] + 1;
int vdw_range_y = 2 * rep_vdw[1] + 1;
int vdw_range_z = 2 * rep_vdw[2] + 1;
int tau_loop_size_vdw = vdw_range_x * vdw_range_y * vdw_range_z * 3;
// Reallocate tau_vdw / tau_idx_vdw only when the flattened grid size changed.
// NOTE(review): two grids with different per-axis shapes but the same total
// size would skip reallocation and reuse the old shape -- confirm this
// cannot happen, or compare the per-axis extents instead.
if (tau_loop_size_vdw != tau_idx_vdw_total_size) {
if (tau_idx_vdw != nullptr) {
// Free the old 4-level tau_vdw array using the OLD extents.
for (int i = 0; i < vdwrx_save; i++) {
for (int j = 0; j < vdwry_save; j++) {
for (int k = 0; k < vdwrz_save; k++) {
cudaFree(tau_vdw[i][j][k]);
}
cudaFree(tau_vdw[i][j]);
}
cudaFree(tau_vdw[i]);
}
cudaFree(tau_vdw);
cudaFree(tau_idx_vdw);
}
tau_idx_vdw_total_size = tau_loop_size_vdw;
// Unified-memory allocation: filled on the host in precalculate_tau_array()
// and read by the force kernels on the device.
cudaMallocManaged(&tau_vdw, vdw_range_x * sizeof(float***));
for (int i = 0; i < vdw_range_x; i++) {
cudaMallocManaged(&tau_vdw[i], vdw_range_y * sizeof(float**));
for (int j = 0; j < vdw_range_y; j++) {
cudaMallocManaged(&tau_vdw[i][j], vdw_range_z * sizeof(float*));
for (int k = 0; k < vdw_range_z; k++) {
cudaMallocManaged(&tau_vdw[i][j][k], 3 * sizeof(float));
}
}
}
cudaMallocManaged(&tau_idx_vdw, tau_idx_vdw_total_size * sizeof(int));
}
// Same bookkeeping for the coordination-number repetition grid.
int cn_range_x = 2 * rep_cn[0] + 1;
int cn_range_y = 2 * rep_cn[1] + 1;
int cn_range_z = 2 * rep_cn[2] + 1;
int tau_loop_size_cn = cn_range_x * cn_range_y * cn_range_z * 3;
if (tau_loop_size_cn != tau_idx_cn_total_size) {
if (tau_idx_cn != nullptr) {
// Free the old 4-level tau_cn array using the OLD extents.
for (int i = 0; i < cnrx_save; i++) {
for (int j = 0; j < cnry_save; j++) {
for (int k = 0; k < cnrz_save; k++) {
cudaFree(tau_cn[i][j][k]);
}
cudaFree(tau_cn[i][j]);
}
cudaFree(tau_cn[i]);
}
cudaFree(tau_cn);
cudaFree(tau_idx_cn);
}
tau_idx_cn_total_size = tau_loop_size_cn;
cudaMallocManaged(&tau_cn, cn_range_x * sizeof(float***));
for (int i = 0; i < cn_range_x; i++) {
cudaMallocManaged(&tau_cn[i], cn_range_y * sizeof(float**));
for (int j = 0; j < cn_range_y; j++) {
cudaMallocManaged(&tau_cn[i][j], cn_range_z * sizeof(float*));
for (int k = 0; k < cn_range_z; k++) {
cudaMallocManaged(&tau_cn[i][j][k], 3 * sizeof(float));
}
}
}
cudaMallocManaged(&tau_idx_cn, tau_idx_cn_total_size * sizeof(int));
}
}
/* ----------------------------------------------------------------------
Set repetition criteria (used in PairD3::compute)
Needed as Periodic Boundary Condition should be considered.
As the cell may *not* be orthorhombic,
the dot product should be used between x/y/z direction and
corresponding cross product vector.
------------------------------------------------------------------------- */
void PairD3::set_lattice_repetition_criteria(float r_threshold, int* rep_v) {
    // How many periodic images per lattice direction are required so that a
    // sphere of radius sqrt(r_threshold) is covered (r_threshold is a squared
    // cutoff). For a triclinic cell the relevant extent along a_i is the cell
    // height: dot(a_i, n) / |n| with n normal to the plane of the other two
    // lattice vectors.
    const double cutoff = sqrt(r_threshold);
    double normal_23[3], normal_31[3], normal_12[3];
    cross3(lat_v_2, lat_v_3, normal_23);  // normal to the (a2, a3) plane
    cross3(lat_v_3, lat_v_1, normal_31);  // normal to the (a3, a1) plane
    cross3(lat_v_1, lat_v_2, normal_12);  // normal to the (a1, a2) plane
    double height;
    height = dot3(normal_23, lat_v_1) / len3(normal_23);
    rep_v[0] = static_cast<int>(std::abs(cutoff / height)) + 1;
    height = dot3(normal_31, lat_v_2) / len3(normal_31);
    rep_v[1] = static_cast<int>(std::abs(cutoff / height)) + 1;
    height = dot3(normal_12, lat_v_3) / len3(normal_12);
    rep_v[2] = static_cast<int>(std::abs(cutoff / height)) + 1;
    // Non-periodic directions need no images.
    if (!domain->xperiodic) { rep_v[0] = 0; }
    if (!domain->yperiodic) { rep_v[1] = 0; }
    if (!domain->zperiodic) { rep_v[2] = 0; }
}
/* ----------------------------------------------------------------------
Calculate Coordination Number (used in PairD3::compute)
------------------------------------------------------------------------- */
__global__ void kernel_get_coordination_number(
int maxij, int maxtau, float cnthr, float K1,
float *rcov, int *rep_cn, float ****tau_cn, int *tau_idx_cn, int *type, float **x,
double *cn
) {
// One thread per unique atom pair (iat >= jat), linear pair index "iter".
int iter = blockIdx.x * blockDim.x + threadIdx.x;
if (iter < maxij) {
int iat, jat;
ij_at_linij(iter, iat, jat);
float cn_local = 0.0f;
if (iat == jat) {
// Self pair: the atom against its own periodic images only.
const float rcov_sum = rcov[type[iat]] * 2.0f;
// tau_idx_cn stores (ix, iy, iz) index triplets back-to-back.
for (int k = maxtau - 1; k >= 0; k -= 3) {
const int idx1 = tau_idx_cn[k-2];
const int idx2 = tau_idx_cn[k-1];
const int idx3 = tau_idx_cn[k];
// (rep, rep, rep) indexes the unshifted (0,0,0) image: the atom itself.
if (idx1 == rep_cn[0] && idx2 == rep_cn[1] && idx3 == rep_cn[2]) { continue; }
const float rx = tau_cn[idx1][idx2][idx3][0];
const float ry = tau_cn[idx1][idx2][idx3][1];
const float rz = tau_cn[idx1][idx2][idx3][2];
const float r2 = rx * rx + ry * ry + rz * rz;
// cnthr is a squared cutoff radius.
if (r2 <= cnthr) {
const float r_rc = rsqrtf(r2);
// Counting function 1 / (1 + exp(-K1 * (rcov_ij / r - 1))).
const float damp = 1.0f / (1.0f + expf(-K1 * ((rcov_sum * r_rc) - 1.0f)));
cn_local += damp;
}
}
atomicAdd(&cn[iat], cn_local);
}
else {
// Distinct pair: identical contribution accumulated onto both atoms.
const float rcov_sum = rcov[type[iat]] + rcov[type[jat]];
for (int k = maxtau - 1; k >= 0; k -= 3) {
const int idx1 = tau_idx_cn[k-2];
const int idx2 = tau_idx_cn[k-1];
const int idx3 = tau_idx_cn[k];
const float rx = x[jat][0] - x[iat][0] + tau_cn[idx1][idx2][idx3][0];
const float ry = x[jat][1] - x[iat][1] + tau_cn[idx1][idx2][idx3][1];
const float rz = x[jat][2] - x[iat][2] + tau_cn[idx1][idx2][idx3][2];
const float r2 = rx * rx + ry * ry + rz * rz;
if (r2 <= cnthr) {
const float r_rc = rsqrtf(r2);
const float damp = 1.0f / (1.0f + expf(-K1 * ((rcov_sum * r_rc) - 1.0f)));
cn_local += damp;
}
}
atomicAdd(&cn[iat], cn_local);
atomicAdd(&cn[jat], cn_local);
}
}
}
void PairD3::get_coordination_number() {
    const int natoms = atom->natoms;
    const int num_pairs = natoms * (natoms + 1) / 2;
    const int num_tau = tau_idx_cn_total_size;
    // The kernel accumulates atomically into cn, so clear it first.
    for (int i = 0; i < natoms; i++) { cn[i] = 0.0; }
    //START_CUDA_TIMER();
    const int block_size = 128;
    const int grid_size = (num_pairs + block_size - 1) / block_size;
    kernel_get_coordination_number<<<grid_size, block_size>>>(
        num_pairs, num_tau, cnthr, K1,
        rcov, rep_cn, tau_cn, tau_idx_cn, atomtype, x,
        cn
    );
    // cn lives in managed memory and is read on the host afterwards.
    cudaDeviceSynchronize();
    //STOP_CUDA_TIMER("get_coord");
}
/* ----------------------------------------------------------------------
reallocate memory if the number of atoms has changed (used in PairD3::compute)
------------------------------------------------------------------------- */
void PairD3::reallocate_arrays() {
    // Release the per-atom and per-pair buffers sized for the previous atom
    // count (n_save). cudaFree(nullptr) is a no-op, so freeing before the
    // first allocation is safe as long as the pointers start as null.
    for (int i = 0; i < n_save; i++) { cudaFree(x[i]); }
    cudaFree(x);
    for (int i = 0; i < n_save; i++) { cudaFree(f[i]); }
    cudaFree(f);
    cudaFree(cn);
    cudaFree(dc6i);
    cudaFree(dc6_iji_tot);
    cudaFree(dc6_ijj_tot);
    cudaFree(c6_ij_tot);
    cudaFree(atomtype);
    // Allocate for the current atom count. Unified memory: written on the
    // host, read/accumulated by the kernels.
    const int n = atom->natoms;
    n_save = n;
    const int n_pairs = n * (n + 1) / 2;  // number of unique (i, j) pairs, i >= j
    cudaMallocManaged(&cn, n * sizeof(double));
    cudaMallocManaged(&dc6i, n * sizeof(double));
    cudaMallocManaged(&atomtype, n * sizeof(int));
    cudaMallocManaged(&x, n * sizeof(float*));
    for (int i = 0; i < n; i++) { cudaMallocManaged(&x[i], 3 * sizeof(float)); }
    cudaMallocManaged(&f, n * sizeof(double*));
    for (int i = 0; i < n; i++) { cudaMallocManaged(&f[i], 3 * sizeof(double)); }
    cudaMallocManaged(&dc6_iji_tot, n_pairs * sizeof(float));
    cudaMallocManaged(&dc6_ijj_tot, n_pairs * sizeof(float));
    cudaMallocManaged(&c6_ij_tot, n_pairs * sizeof(float));
}
void PairD3::reallocate_arrays_np1() {
    // Release the per-type tables sized for the previous ntypes+1 (np1_save).
    cudaFree(r2r4);
    cudaFree(rcov);
    cudaFree(mxc);
    for (int ti = 0; ti < np1_save; ti++) { cudaFree(r0ab[ti]); }
    cudaFree(r0ab);
    // c6ab is a 5-level jagged array: [type_i][type_j][MAXC][MAXC][3].
    for (int ti = 0; ti < np1_save; ti++) {
        for (int tj = 0; tj < np1_save; tj++) {
            for (int ci = 0; ci < MAXC; ci++) {
                for (int cj = 0; cj < MAXC; cj++) {
                    cudaFree(c6ab[ti][tj][ci][cj]);
                }
                cudaFree(c6ab[ti][tj][ci]);
            }
            cudaFree(c6ab[ti][tj]);
        }
        cudaFree(c6ab[ti]);
    }
    cudaFree(c6ab);
    // Allocate for the current number of atom types (unified memory; type
    // indices are 1-based in LAMMPS, hence ntypes + 1 slots).
    const int np1 = atom->ntypes + 1;
    np1_save = np1;
    cudaMallocManaged(&r2r4, np1 * sizeof(float));
    cudaMallocManaged(&rcov, np1 * sizeof(float));
    cudaMallocManaged(&mxc, np1 * sizeof(int));
    cudaMallocManaged(&r0ab, np1 * sizeof(float*));
    for (int ti = 0; ti < np1; ti++) { cudaMallocManaged(&r0ab[ti], np1 * sizeof(float)); }
    cudaMallocManaged(&c6ab, np1 * sizeof(float****));
    for (int ti = 0; ti < np1; ti++) {
        cudaMallocManaged(&c6ab[ti], np1 * sizeof(float***));
        for (int tj = 0; tj < np1; tj++) {
            cudaMallocManaged(&c6ab[ti][tj], MAXC * sizeof(float**));
            for (int ci = 0; ci < MAXC; ci++) {
                cudaMallocManaged(&c6ab[ti][tj][ci], MAXC * sizeof(float*));
                for (int cj = 0; cj < MAXC; cj++) {
                    cudaMallocManaged(&c6ab[ti][tj][ci][cj], 3 * sizeof(float));
                }
            }
        }
    }
}
/* ----------------------------------------------------------------------
Initialize atomic positions & types (used in PairD3::compute)
As the default xyz from lammps does not assure that atoms are within unit cell,
this function shifts atoms into the unit cell.
------------------------------------------------------------------------- */
void PairD3::load_atom_info() {
// Lattice matrix with the lattice vectors as COLUMNS, so lat * frac = cart.
double lat[3][3];
lat[0][0] = lat_v_1[0];
lat[0][1] = lat_v_2[0];
lat[0][2] = lat_v_3[0];
lat[1][0] = lat_v_1[1];
lat[1][1] = lat_v_2[1];
lat[1][2] = lat_v_3[1];
lat[2][0] = lat_v_1[2];
lat[2][1] = lat_v_2[2];
lat[2][2] = lat_v_3[2];
// Determinant for the explicit 3x3 inverse below (no singularity check:
// a valid simulation box is assumed non-degenerate).
double det = lat[0][0] * lat[1][1] * lat[2][2]
+ lat[0][1] * lat[1][2] * lat[2][0]
+ lat[0][2] * lat[1][0] * lat[2][1]
- lat[0][2] * lat[1][1] * lat[2][0]
- lat[0][1] * lat[1][0] * lat[2][2]
- lat[0][0] * lat[1][2] * lat[2][1];
// Inverse via the adjugate / determinant: maps cartesian -> fractional.
double lat_inv[3][3];
lat_inv[0][0] = (lat[1][1] * lat[2][2] - lat[1][2] * lat[2][1]) / det;
lat_inv[1][0] = (lat[1][2] * lat[2][0] - lat[1][0] * lat[2][2]) / det;
lat_inv[2][0] = (lat[1][0] * lat[2][1] - lat[1][1] * lat[2][0]) / det;
lat_inv[0][1] = (lat[0][2] * lat[2][1] - lat[0][1] * lat[2][2]) / det;
lat_inv[1][1] = (lat[0][0] * lat[2][2] - lat[0][2] * lat[2][0]) / det;
lat_inv[2][1] = (lat[0][1] * lat[2][0] - lat[0][0] * lat[2][1]) / det;
lat_inv[0][2] = (lat[0][1] * lat[1][2] - lat[0][2] * lat[1][1]) / det;
lat_inv[1][2] = (lat[0][2] * lat[1][0] - lat[0][0] * lat[1][2]) / det;
lat_inv[2][2] = (lat[0][0] * lat[1][1] - lat[0][1] * lat[1][0]) / det;
double a[3] = { 0.0 };
for (int iat = 0; iat < atom->natoms; iat++) {
for (int i = 0; i < 3; i++) {
// Fractional coordinate of atom iat; positions converted Angstrom -> Bohr.
a[i] = lat_inv[i][0] * (atom->x)[iat][0] / AU_TO_ANG +
lat_inv[i][1] * (atom->x)[iat][1] / AU_TO_ANG +
lat_inv[i][2] * (atom->x)[iat][2] / AU_TO_ANG;
// Wrap into [0, 1) in one step.
a[i] -= floor(a[i]); // replaces the code below
//if (a[i] > 1) { while (a[i] > 1) { a[i]--; } }
//else if (a[i] < 0) { while (a[i] < 0) { a[i]++; } }
}
// Back to cartesian (Bohr), stored in the float working array x used by
// the GPU kernels.
for (int i = 0; i < 3; i++) {
x[iat][i] = (lat[i][0] * a[0] + lat[i][1] * a[1] + lat[i][2] * a[2]);
}
}
}
/* ----------------------------------------------------------------------
Precalculate tau array
------------------------------------------------------------------------- */
void PairD3::precalculate_tau_array() {
    // Fill a tau table for one repetition grid:
    //   tau[i][j][k] = (i - xlim)*a1 + (j - ylim)*a2 + (k - zlim)*a3
    // and append the (i, j, k) triplets consecutively to tau_idx.
    auto fill_tau = [&](const int* rep, float**** tau, int* tau_idx) {
        const int xlim = rep[0];
        const int ylim = rep[1];
        const int zlim = rep[2];
        int index = 0;
        for (int tx = -xlim; tx <= xlim; tx++) {
            for (int ty = -ylim; ty <= ylim; ty++) {
                for (int tz = -zlim; tz <= zlim; tz++) {
                    float* shift = tau[tx + xlim][ty + ylim][tz + zlim];
                    shift[0] = lat_v_1[0] * tx + lat_v_2[0] * ty + lat_v_3[0] * tz;
                    shift[1] = lat_v_1[1] * tx + lat_v_2[1] * ty + lat_v_3[1] * tz;
                    shift[2] = lat_v_1[2] * tx + lat_v_2[2] * ty + lat_v_3[2] * tz;
                    tau_idx[index++] = tx + xlim;
                    tau_idx[index++] = ty + ylim;
                    tau_idx[index++] = tz + zlim;
                }
            }
        }
    };
    // Same procedure for the vdW and the coordination-number grids.
    fill_tau(rep_vdw, tau_vdw, tau_idx_vdw);
    fill_tau(rep_cn, tau_cn, tau_idx_cn);
}
/* ----------------------------------------------------------------------
Get forces (Zero damping)
------------------------------------------------------------------------- */
// One thread per unique atom pair (iat >= jat). Forces and dC6 terms are
// accumulated with per-atom atomics; the virial (sigma) and dispersion energy
// (disp) are first reduced inside the block in shared memory so that only one
// global atomicAdd per block is issued for each of them.
__global__ void kernel_get_forces_without_dC6_zero(
int maxij, int maxtau, float rthr, float s6, float s8, float a1, float a2, float alp6, float alp8,
float *r2r4, float **r0ab, int *rep_vdw, float ****tau_vdw, int *tau_idx_vdw, int *type, float **x,
float *c6_ij_tot, float *dc6_iji_tot, float *dc6_ijj_tot,
double *dc6i, double *disp, double **f, double **sigma
) {
int iter = blockIdx.x * blockDim.x + threadIdx.x;
// Shared buffers sized for exactly 128 threads: the launcher uses
// threadsPerBlock = 128 and the tree reduction assumes a power of two.
__shared__ float sigma_00[128];
__shared__ float sigma_01[128];
__shared__ float sigma_02[128];
__shared__ float sigma_10[128];
__shared__ float sigma_11[128];
__shared__ float sigma_12[128];
__shared__ float sigma_20[128];
__shared__ float sigma_21[128];
__shared__ float sigma_22[128];
__shared__ float disp_shared[128];
// Per-thread partials; threads with iter >= maxij contribute zeros to the
// block reduction below.
float sigma_local_00 = 0.0f;
float sigma_local_01 = 0.0f;
float sigma_local_02 = 0.0f;
float sigma_local_10 = 0.0f;
float sigma_local_11 = 0.0f;
float sigma_local_12 = 0.0f;
float sigma_local_20 = 0.0f;
float sigma_local_21 = 0.0f;
float sigma_local_22 = 0.0f;
float disp_local = 0.0f;
if (iter < maxij) {
int iat, jat;
ij_at_linij(iter, iat, jat);
float f_local[3] = { 0.0f };
float dc6i_local_i = 0.0f;
float dc6i_local_j = 0.0f;
// Pair-interpolated C6 and its CN derivatives, precomputed by
// kernel_get_dC6_dCNij for the same linear pair index.
const float c6 = c6_ij_tot[iter];
const float dc6iji = dc6_iji_tot[iter];
const float dc6ijj = dc6_ijj_tot[iter];
if (iat == jat) {
// Self pair: atom iat against its own periodic images. Only energy,
// virial and dC6 terms are accumulated (no per-atom force term here).
const int atomtype_i = type[iat];
const float r0 = r0ab[atomtype_i][atomtype_i];
const float unit_r2r4 = r2r4[atomtype_i];
const float r42 = unit_r2r4 * unit_r2r4;
const float unit_a1 = (a1 * r0);
const float unit_a2 = (a2 * r0);
const float s8r42 = s8 * r42;
// tau_idx_vdw stores (ix, iy, iz) triplets back-to-back.
for (int k = maxtau - 1; k >= 0; k -= 3) {
const int idx1 = tau_idx_vdw[k-2];
const int idx2 = tau_idx_vdw[k-1];
const int idx3 = tau_idx_vdw[k];
// (rep, rep, rep) is the unshifted (0,0,0) image: the atom itself.
if (idx1 == rep_vdw[0] && idx2 == rep_vdw[1] && idx3 == rep_vdw[2]) { continue; }
const float rij[3] = {
tau_vdw[idx1][idx2][idx3][0],
tau_vdw[idx1][idx2][idx3][1],
tau_vdw[idx1][idx2][idx3][2]
};
const float r2 = lensq3(rij);
// rthr is a squared cutoff.
if (r2 > rthr) { continue; }
const float r_rc = rsqrtf(r2);
// t6 = (a1*r0/r)^14 built by repeated squaring/multiplication.
float unit_rc_a1 = unit_a1 * r_rc;
float t6 = unit_rc_a1 * unit_rc_a1; // ^2
t6 *= unit_rc_a1; // ^3
t6 *= t6; // ^6
t6 *= unit_rc_a1; // ^7
t6 *= t6; // ^14
const float damp6 = 1.0f / fmaf(t6, 6.0f, 1.0f);
// t8 = (a2*r0/r)^16.
float unit_rc_a2 = unit_a2 * r_rc;
float t8 = unit_rc_a2 * unit_rc_a2; // ^2
t8 *= t8; // ^4
t8 *= t8; // ^8
t8 *= t8; // ^16
const float damp8 = 1.0f / fmaf(t8, 6.0f, 1.0f);
const float r2_rc = r_rc * r_rc; // 1.0 / r2
const float r6_rc = r2_rc * r2_rc * r2_rc;
const float r8_rc = r6_rc * r2_rc;
// 3.0f = 6.0f * 0.5: self-pair contributions carry a factor 1/2
// (see the commented-out expanded form below).
const float x1 = 3.0f * c6 * r8_rc * fmaf(r2_rc, s8r42 * damp8 * fmaf(3.0f * alp8 * t8, damp8, -4.0f), s6 * damp6 * fmaf(alp6 * t6, damp6, -1.0f));
//const float x1 = 0.5 * 6.0 * c6 * r8_rc * (s6 * damp6 * (14.0 * t6 * damp6 - 1.0) + s8r42 * r2_rc * damp8 * (48.0 * t8 * damp8 - 4.0));
//3.0 * alp6 = 48.0
const float vec[3] = {
x1 * rij[0],
x1 * rij[1],
x1 * rij[2]
};
// Virial outer product vec (x) rij.
sigma_local_00 += vec[0] * rij[0];
sigma_local_01 += vec[0] * rij[1];
sigma_local_02 += vec[0] * rij[2];
sigma_local_10 += vec[1] * rij[0];
sigma_local_11 += vec[1] * rij[1];
sigma_local_12 += vec[1] * rij[2];
sigma_local_20 += vec[2] * rij[0];
sigma_local_21 += vec[2] * rij[1];
sigma_local_22 += vec[2] * rij[2];
// Pair energy prefactor (with the 0.5 self-pair factor).
const float dc6_rest = 0.5f * r6_rc * fmaf(3.0f * r2_rc, s8r42 * damp8, s6 * damp6);
//const float dc6_rest = 0.5 * r6_rc * (s6 * damp6 + 3.0 * s8r42 * damp8 * r2_rc);
disp_local -= dc6_rest * c6;
dc6i_local_i += dc6_rest * dc6iji;
dc6i_local_j += dc6_rest * dc6ijj;
}
// dc6i is double; atomicAdd(double*, ...) requires SM60+.
atomicAdd(&dc6i[iat], dc6i_local_i);
atomicAdd(&dc6i[jat], dc6i_local_j);
}
else {
// Distinct pair: same math, full weight 6.0f, Newton's-third-law force.
const int atomtype_i = type[iat];
const int atomtype_j = type[jat];
const float r0 = r0ab[atomtype_i][atomtype_j];
const float r42 = r2r4[atomtype_i] * r2r4[atomtype_j];
const float unit_a1 = (a1 * r0);
const float unit_a2 = (a2 * r0);
const float s8r42 = s8 * r42;
for (int k = maxtau - 1; k >= 0; k -= 3) {
const int idx1 = tau_idx_vdw[k-2];
const int idx2 = tau_idx_vdw[k-1];
const int idx3 = tau_idx_vdw[k];
const float rij[3] = {
x[jat][0] - x[iat][0] + tau_vdw[idx1][idx2][idx3][0],
x[jat][1] - x[iat][1] + tau_vdw[idx1][idx2][idx3][1],
x[jat][2] - x[iat][2] + tau_vdw[idx1][idx2][idx3][2]
};
const float r2 = lensq3(rij);
if (r2 > rthr) { continue; }
const float r_rc = rsqrtf(r2);
float unit_rc_a1 = unit_a1 * r_rc;
float t6 = unit_rc_a1 * unit_rc_a1; // ^2
t6 *= unit_rc_a1; // ^3
t6 *= t6; // ^6
t6 *= unit_rc_a1; // ^7
t6 *= t6; // ^14
const float damp6 = 1.0f / fmaf(t6, 6.0f, 1.0f);
float unit_rc_a2 = unit_a2 * r_rc;
float t8 = unit_rc_a2 * unit_rc_a2; // ^2
t8 *= t8; // ^4
t8 *= t8; // ^8
t8 *= t8; // ^16
const float damp8 = 1.0f / fmaf(t8, 6.0f, 1.0f);
const float r2_rc = r_rc * r_rc; // 1.0 / r2
const float r6_rc = r2_rc * r2_rc * r2_rc;
const float r8_rc = r6_rc * r2_rc;
const float x1 = 6.0f * c6 * r8_rc * fmaf(r2_rc, s8r42 * damp8 * fmaf(3.0f * alp8 * t8, damp8, -4.0f), s6 * damp6 * fmaf(alp6 * t6, damp6, -1.0f));
//const float x1 = 6.0 * c6 * r8_rc * (s6 * damp6 * (14.0 * t6 * damp6 - 1.0) + s8r42 * r2_rc * damp8 * (48.0 * t8 * damp8 - 4.0));
//3.0 * alp6 = 48.0
const float vec[3] = {
x1 * rij[0],
x1 * rij[1],
x1 * rij[2]
};
f_local[0] -= vec[0];
f_local[1] -= vec[1];
f_local[2] -= vec[2];
sigma_local_00 += vec[0] * rij[0];
sigma_local_01 += vec[0] * rij[1];
sigma_local_02 += vec[0] * rij[2];
sigma_local_10 += vec[1] * rij[0];
sigma_local_11 += vec[1] * rij[1];
sigma_local_12 += vec[1] * rij[2];
sigma_local_20 += vec[2] * rij[0];
sigma_local_21 += vec[2] * rij[1];
sigma_local_22 += vec[2] * rij[2];
const float dc6_rest = r6_rc * fmaf(3.0f * r2_rc, s8r42 * damp8, s6 * damp6);
//const float dc6_rest = r6_rc * (s6 * damp6 + 3.0 * s8r42 * damp8 * r2_rc);
disp_local -= dc6_rest * c6;
dc6i_local_i += dc6_rest * dc6iji;
dc6i_local_j += dc6_rest * dc6ijj;
}
atomicAdd(&dc6i[iat], dc6i_local_i);
atomicAdd(&dc6i[jat], dc6i_local_j);
// Equal and opposite forces on the two atoms (f is double; see SM60+ note).
atomicAdd(&f[iat][0], f_local[0]);
atomicAdd(&f[iat][1], f_local[1]);
atomicAdd(&f[iat][2], f_local[2]);
atomicAdd(&f[jat][0], -f_local[0]);
atomicAdd(&f[jat][1], -f_local[1]);
atomicAdd(&f[jat][2], -f_local[2]);
}
}
// Block-wide tree reduction of virial and energy partials. Every thread
// (including out-of-range ones) reaches the barriers below.
sigma_00[threadIdx.x] = sigma_local_00;
sigma_01[threadIdx.x] = sigma_local_01;
sigma_02[threadIdx.x] = sigma_local_02;
sigma_10[threadIdx.x] = sigma_local_10;
sigma_11[threadIdx.x] = sigma_local_11;
sigma_12[threadIdx.x] = sigma_local_12;
sigma_20[threadIdx.x] = sigma_local_20;
sigma_21[threadIdx.x] = sigma_local_21;
sigma_22[threadIdx.x] = sigma_local_22;
disp_shared[threadIdx.x] = disp_local;
__syncthreads();
for (int s=blockDim.x/2; s>0; s>>=1) {
if (threadIdx.x < s) {
sigma_00[threadIdx.x] += sigma_00[threadIdx.x + s];
sigma_01[threadIdx.x] += sigma_01[threadIdx.x + s];
sigma_02[threadIdx.x] += sigma_02[threadIdx.x + s];
sigma_10[threadIdx.x] += sigma_10[threadIdx.x + s];
sigma_11[threadIdx.x] += sigma_11[threadIdx.x + s];
sigma_12[threadIdx.x] += sigma_12[threadIdx.x + s];
sigma_20[threadIdx.x] += sigma_20[threadIdx.x + s];
sigma_21[threadIdx.x] += sigma_21[threadIdx.x + s];
sigma_22[threadIdx.x] += sigma_22[threadIdx.x + s];
disp_shared[threadIdx.x] += disp_shared[threadIdx.x + s];
}
__syncthreads();
}
// One global atomic per block for each reduced quantity.
if (threadIdx.x == 0) {
atomicAdd(&sigma[0][0], sigma_00[0]);
atomicAdd(&sigma[0][1], sigma_01[0]);
atomicAdd(&sigma[0][2], sigma_02[0]);
atomicAdd(&sigma[1][0], sigma_10[0]);
atomicAdd(&sigma[1][1], sigma_11[0]);
atomicAdd(&sigma[1][2], sigma_12[0]);
atomicAdd(&sigma[2][0], sigma_20[0]);
atomicAdd(&sigma[2][1], sigma_21[0]);
atomicAdd(&sigma[2][2], sigma_22[0]);
atomicAdd(disp, disp_shared[0]);
}
}
void PairD3::get_forces_without_dC6_zero() {
    const int natoms = atom->natoms;
    const int num_pairs = natoms * (natoms + 1) / 2;
    const int num_tau = tau_idx_vdw_total_size;
    // The kernel accumulates atomically into all of these; clear them first.
    *disp = 0.0;
    for (int i = 0; i < natoms; i++) { dc6i[i] = 0.0; }
    for (int i = 0; i < natoms; i++) {
        f[i][0] = 0.0;
        f[i][1] = 0.0;
        f[i][2] = 0.0;
    }
    for (int row = 0; row < 3; row++) {
        for (int col = 0; col < 3; col++) { sigma[row][col] = 0.0; }
    }
    //START_CUDA_TIMER();
    const int block_size = 128;
    const int grid_size = (num_pairs + block_size - 1) / block_size;
    kernel_get_forces_without_dC6_zero<<<grid_size, block_size>>>(
        num_pairs, num_tau, rthr, s6, s8, a1, a2, alp6, alp8,
        r2r4, r0ab, rep_vdw, tau_vdw, tau_idx_vdw, atomtype, x,
        c6_ij_tot, dc6_iji_tot, dc6_ijj_tot,
        dc6i, disp, f, sigma
    );
    cudaDeviceSynchronize();
    // disp lives in managed memory; safe to read only after the sync above.
    disp_total = *disp;
    //STOP_CUDA_TIMER("get_forces_without");
}
// Becke-Johnson damping variant: one thread per unique atom pair (iat >= jat).
// Forces and dC6 terms use per-atom atomics; virial (sigma) and dispersion
// energy (disp) are block-reduced in shared memory before one global atomicAdd
// per block.
__global__ void kernel_get_forces_without_dC6_bj(
int maxij, int maxtau, float rthr, float s6, float s8, float a1, float a2,
float *r2r4, int *rep_vdw, float ****tau_vdw, int *tau_idx_vdw, int *type, float **x,
float *c6_ij_tot, float *dc6_iji_tot, float *dc6_ijj_tot,
double *dc6i, double *disp, double **f, double **sigma
) {
int iter = blockIdx.x * blockDim.x + threadIdx.x;
// Shared buffers sized for exactly 128 threads: launcher uses
// threadsPerBlock = 128 and the tree reduction assumes a power of two.
__shared__ float sigma_00[128];
__shared__ float sigma_01[128];
__shared__ float sigma_02[128];
__shared__ float sigma_10[128];
__shared__ float sigma_11[128];
__shared__ float sigma_12[128];
__shared__ float sigma_20[128];
__shared__ float sigma_21[128];
__shared__ float sigma_22[128];
__shared__ float disp_shared[128];
// Per-thread partials; out-of-range threads contribute zeros below.
float sigma_local_00 = 0.0f;
float sigma_local_01 = 0.0f;
float sigma_local_02 = 0.0f;
float sigma_local_10 = 0.0f;
float sigma_local_11 = 0.0f;
float sigma_local_12 = 0.0f;
float sigma_local_20 = 0.0f;
float sigma_local_21 = 0.0f;
float sigma_local_22 = 0.0f;
float disp_local = 0.0f;
if (iter < maxij) {
int iat, jat;
ij_at_linij(iter, iat, jat);
float f_local[3] = { 0.0f };
float dc6i_local_i = 0.0f;
float dc6i_local_j = 0.0f;
// Pair-interpolated C6 and CN derivatives from kernel_get_dC6_dCNij.
const float c6 = c6_ij_tot[iter];
const float dc6iji = dc6_iji_tot[iter];
const float dc6ijj = dc6_ijj_tot[iter];
if (iat == jat) {
// Self pair: atom against its own periodic images; no net force term,
// only energy / virial / dC6 with a factor 1/2.
const float unit_r2r4 = r2r4[type[iat]];
const float r42x3 = unit_r2r4 * unit_r2r4 * 3.0f;
// BJ damping radius R0 = a1 * sqrt(3 * r2r4_i * r2r4_j) + a2.
const float R0 = fmaf(a1, sqrtf(r42x3), a2);
const float R0_2 = R0 * R0;
const float R0_6 = R0_2 * R0_2 * R0_2;
const float R0_8 = R0_6 * R0_2;
const float s8r42x3 = s8 * r42x3;
// tau_idx_vdw stores (ix, iy, iz) triplets back-to-back.
for (int k = maxtau - 1; k >= 0; k -= 3) {
const int idx1 = tau_idx_vdw[k-2];
const int idx2 = tau_idx_vdw[k-1];
const int idx3 = tau_idx_vdw[k];
// (rep, rep, rep) is the unshifted (0,0,0) image: the atom itself.
if (idx1 == rep_vdw[0] && idx2 == rep_vdw[1] && idx3 == rep_vdw[2]) { continue; }
const float rij[3] = {
tau_vdw[idx1][idx2][idx3][0],
tau_vdw[idx1][idx2][idx3][1],
tau_vdw[idx1][idx2][idx3][2]
};
const float r2 = lensq3(rij);
// rthr is a squared cutoff.
if (r2 > rthr) { continue; }
const float r = sqrtf(r2);
const float r5 = r2 * r2 * r;
const float r7 = r5 * r2;
// t6_rc = 1/(r^6 + R0^6), t8_rc = 1/(r^8 + R0^8): BJ denominators.
const float t6_rc = 1.0f / fmaf(r5, r, R0_6);
const float t8_rc = 1.0f / fmaf(r7, r, R0_8);
const float t6_sqrc = t6_rc * t6_rc;
const float t8_sqrc = t8_rc * t8_rc;
// 3.0f/4.0f = (6.0f/8.0f) * 0.5: self-pair factor 1/2 (see the
// commented-out expanded form below).
const float x1 = -c6 * fmaf(4.0f * s8r42x3 * r7, t8_sqrc, 3.0f * s6 * r5 * t6_sqrc);
//const float x1 = 0.5 * -c6 * (6.0 * s6 * r5 * t6_sqrc + 8.0 * s8r42x3 * r7 * t8_sqrc;
const float r_rc = 1.0f / r; // rsqrt(r2)
const float vec[3] = {
x1 * rij[0] * r_rc,
x1 * rij[1] * r_rc,
x1 * rij[2] * r_rc
};
// Virial outer product vec (x) rij.
sigma_local_00 += vec[0] * rij[0];
sigma_local_01 += vec[0] * rij[1];
sigma_local_02 += vec[0] * rij[2];
sigma_local_10 += vec[1] * rij[0];
sigma_local_11 += vec[1] * rij[1];
sigma_local_12 += vec[1] * rij[2];
sigma_local_20 += vec[2] * rij[0];
sigma_local_21 += vec[2] * rij[1];
sigma_local_22 += vec[2] * rij[2];
const float dc6_rest = 0.5f * fmaf(s8r42x3, t8_rc, s6 * t6_rc);
//const float dc6_rest = 0.5 * s6 * t6_rc + s8r42x3 * t8_rc;
disp_local -= dc6_rest * c6;
dc6i_local_i += dc6_rest * dc6iji;
dc6i_local_j += dc6_rest * dc6ijj;
}
// dc6i is double; atomicAdd(double*, ...) requires SM60+.
atomicAdd(&dc6i[iat], dc6i_local_i);
atomicAdd(&dc6i[jat], dc6i_local_j);
}
else {
// Distinct pair: full weight (6.0/8.0 factors), Newton's-third-law force.
const float r42x3 = r2r4[type[iat]] * r2r4[type[jat]] * 3.0f;
const float R0 = fmaf(a1, sqrtf(r42x3), a2);
const float R0_2 = R0 * R0;
const float R0_6 = R0_2 * R0_2 * R0_2;
const float R0_8 = R0_6 * R0_2;
const float s8r42x3 = s8 * r42x3;
for (int k = maxtau - 1; k >= 0; k -= 3) {
const int idx1 = tau_idx_vdw[k-2];
const int idx2 = tau_idx_vdw[k-1];
const int idx3 = tau_idx_vdw[k];
const float rij[3] = {
x[jat][0] - x[iat][0] + tau_vdw[idx1][idx2][idx3][0],
x[jat][1] - x[iat][1] + tau_vdw[idx1][idx2][idx3][1],
x[jat][2] - x[iat][2] + tau_vdw[idx1][idx2][idx3][2]
};
const float r2 = lensq3(rij);
if (r2 > rthr) { continue; }
const float r = sqrtf(r2);
const float r5 = r2 * r2 * r;
const float r7 = r5 * r2;
const float t6_rc = 1.0f / fmaf(r5, r, R0_6);
const float t8_rc = 1.0f / fmaf(r7, r, R0_8);
const float t6_sqrc = t6_rc * t6_rc;
const float t8_sqrc = t8_rc * t8_rc;
const float x1 = -c6 * fmaf(8.0f * s8r42x3 * r7, t8_sqrc, 6.0f * s6 * r5 * t6_sqrc);
//const float x1 = -c6 * (6.0 * s6 * r5 * t6_sqrc + 8.0 * s8r42x3 * r7 * t8_sqrc;
const float r_rc = 1.0f / r; // rsqrt(r2)
const float vec[3] = {
x1 * rij[0] * r_rc,
x1 * rij[1] * r_rc,
x1 * rij[2] * r_rc
};
f_local[0] -= vec[0];
f_local[1] -= vec[1];
f_local[2] -= vec[2];
sigma_local_00 += vec[0] * rij[0];
sigma_local_01 += vec[0] * rij[1];
sigma_local_02 += vec[0] * rij[2];
sigma_local_10 += vec[1] * rij[0];
sigma_local_11 += vec[1] * rij[1];
sigma_local_12 += vec[1] * rij[2];
sigma_local_20 += vec[2] * rij[0];
sigma_local_21 += vec[2] * rij[1];
sigma_local_22 += vec[2] * rij[2];
const float dc6_rest = fmaf(s8r42x3, t8_rc, s6 * t6_rc);
//const float dc6_rest = s6 * t6_rc + s8r42x3 * t8_rc;
disp_local -= dc6_rest * c6;
dc6i_local_i += dc6_rest * dc6iji;
dc6i_local_j += dc6_rest * dc6ijj;
}
atomicAdd(&dc6i[iat], dc6i_local_i);
atomicAdd(&dc6i[jat], dc6i_local_j);
// Equal and opposite forces (f is double; SM60+ for double atomicAdd).
atomicAdd(&f[iat][0], f_local[0]);
atomicAdd(&f[iat][1], f_local[1]);
atomicAdd(&f[iat][2], f_local[2]);
atomicAdd(&f[jat][0], -f_local[0]);
atomicAdd(&f[jat][1], -f_local[1]);
atomicAdd(&f[jat][2], -f_local[2]);
}
}
// Block-wide tree reduction; all threads (including out-of-range ones)
// reach both barriers.
sigma_00[threadIdx.x] = sigma_local_00;
sigma_01[threadIdx.x] = sigma_local_01;
sigma_02[threadIdx.x] = sigma_local_02;
sigma_10[threadIdx.x] = sigma_local_10;
sigma_11[threadIdx.x] = sigma_local_11;
sigma_12[threadIdx.x] = sigma_local_12;
sigma_20[threadIdx.x] = sigma_local_20;
sigma_21[threadIdx.x] = sigma_local_21;
sigma_22[threadIdx.x] = sigma_local_22;
disp_shared[threadIdx.x] = disp_local;
__syncthreads();
for (int s=blockDim.x/2; s>0; s>>=1) {
if (threadIdx.x < s) {
sigma_00[threadIdx.x] += sigma_00[threadIdx.x + s];
sigma_01[threadIdx.x] += sigma_01[threadIdx.x + s];
sigma_02[threadIdx.x] += sigma_02[threadIdx.x + s];
sigma_10[threadIdx.x] += sigma_10[threadIdx.x + s];
sigma_11[threadIdx.x] += sigma_11[threadIdx.x + s];
sigma_12[threadIdx.x] += sigma_12[threadIdx.x + s];
sigma_20[threadIdx.x] += sigma_20[threadIdx.x + s];
sigma_21[threadIdx.x] += sigma_21[threadIdx.x + s];
sigma_22[threadIdx.x] += sigma_22[threadIdx.x + s];
disp_shared[threadIdx.x] += disp_shared[threadIdx.x + s];
}
__syncthreads();
}
// One global atomic per block for each reduced quantity.
if (threadIdx.x == 0) {
atomicAdd(&sigma[0][0], sigma_00[0]);
atomicAdd(&sigma[0][1], sigma_01[0]);
atomicAdd(&sigma[0][2], sigma_02[0]);
atomicAdd(&sigma[1][0], sigma_10[0]);
atomicAdd(&sigma[1][1], sigma_11[0]);
atomicAdd(&sigma[1][2], sigma_12[0]);
atomicAdd(&sigma[2][0], sigma_20[0]);
atomicAdd(&sigma[2][1], sigma_21[0]);
atomicAdd(&sigma[2][2], sigma_22[0]);
atomicAdd(disp, disp_shared[0]);
}
}
/* Zero the accumulators and launch the Becke-Johnson damping dispersion
 * kernel over all unique atom pairs; collects energy, forces, virial and
 * the per-atom dE/dC6 sums (dc6i) in one pass. */
void PairD3::get_forces_without_dC6_bj() {
    const int natoms = atom->natoms;
    const int pair_count = natoms * (natoms + 1) / 2;  // upper-triangular ij pairs
    const int tau_count = tau_idx_vdw_total_size;

    // Reset everything the kernel accumulates into with atomics.
    *disp = 0.0;
    for (int i = 0; i < natoms; ++i) {
        dc6i[i] = 0.0;
        f[i][0] = 0.0;
        f[i][1] = 0.0;
        f[i][2] = 0.0;
    }
    for (int r = 0; r < 3; ++r) {
        sigma[r][0] = 0.0;
        sigma[r][1] = 0.0;
        sigma[r][2] = 0.0;
    }

    //START_CUDA_TIMER();
    // One thread per ij pair; the kernel's shared arrays assume blockDim.x == 128.
    const int block_size = 128;
    const int grid_size = (pair_count + block_size - 1) / block_size;
    kernel_get_forces_without_dC6_bj<<<grid_size, block_size>>>(
        pair_count, tau_count, rthr, s6, s8, a1, a2,
        r2r4, rep_vdw, tau_vdw, tau_idx_vdw, atomtype, x,
        c6_ij_tot, dc6_iji_tot, dc6_ijj_tot,
        dc6i, disp, f, sigma
    );
    cudaDeviceSynchronize();
    disp_total = *disp;
    //STOP_CUDA_TIMER("get_forces_without");
}
// NOTE(review): modified zero-damping variant is not implemented; selecting
// it silently computes no dispersion forces — confirm this is intentional.
void PairD3::get_forces_without_dC6_zerom() {}
// NOTE(review): modified BJ-damping variant is not implemented; selecting
// it silently computes no dispersion forces — confirm this is intentional.
void PairD3::get_forces_without_dC6_bjm() {}
/* Dispatch to the force routine matching the selected damping scheme.
 * 'damping' uses the same ordering as the original dispatch table:
 * 0 zero, 1 bj, 2 zerom, 3 bjm. */
void PairD3::get_forces_without_dC6() {
    switch (damping) {
    case 0: get_forces_without_dC6_zero();  break;
    case 1: get_forces_without_dC6_bj();    break;
    case 2: get_forces_without_dC6_zerom(); break;
    case 3: get_forces_without_dC6_bjm();   break;
    }
}
/* CN-gradient force kernel (the "with dC6" pass): adds the chain-rule term
 * dE/dCN(atom) * dCN/dr to the forces and virial, where the per-atom
 * dE/dCN sums were accumulated into dc6i[] by the previous pass.
 *
 * Launch contract:
 *   - one thread per unique atom pair; 'iter' indexes the upper triangle and
 *     is decoded by ij_at_linij(), guarded by iter < maxij;
 *   - blockDim.x must be exactly 128 (fixed-size shared arrays below) and a
 *     power of two (tree reduction);
 *   - f and sigma are accumulated with atomics and must be pre-zeroed.
 */
__global__ void kernel_get_forces_with_dC6(
    int maxij, int maxtau, float cnthr, float K1,
    double *dc6i, float *rcov, int *rep_cn, float ****tau_cn, int *tau_idx_cn, int *type, float **x,
    double **f, double **sigma
) {
    int iter = blockIdx.x * blockDim.x + threadIdx.x;
    // One reduction slot per thread for each virial component.
    __shared__ float sigma_00[128];
    __shared__ float sigma_01[128];
    __shared__ float sigma_02[128];
    __shared__ float sigma_10[128];
    __shared__ float sigma_11[128];
    __shared__ float sigma_12[128];
    __shared__ float sigma_20[128];
    __shared__ float sigma_21[128];
    __shared__ float sigma_22[128];
    float sigma_local_00 = 0.0f;
    float sigma_local_01 = 0.0f;
    float sigma_local_02 = 0.0f;
    float sigma_local_10 = 0.0f;
    float sigma_local_11 = 0.0f;
    float sigma_local_12 = 0.0f;
    float sigma_local_20 = 0.0f;
    float sigma_local_21 = 0.0f;
    float sigma_local_22 = 0.0f;
    float f_local[3] = { 0.0f };
    if (iter < maxij) {
        int iat, jat;
        ij_at_linij(iter, iat, jat);
        if (iat == jat) {
            // Self pair: only periodic images of the atom contribute. The net
            // force from an atom's own images cancels by symmetry, so only
            // the virial is accumulated here (f_local stays zero).
            const float rcov_sum = rcov[type[iat]] * 2.0f;
            const float dc6i_sum = dc6i[iat];
            // tau_idx_cn holds flat index triplets; walk them back to front.
            for (int k = maxtau - 1; k >= 0; k -= 3) {
                const int idx1 = tau_idx_cn[k-2];
                const int idx2 = tau_idx_cn[k-1];
                const int idx3 = tau_idx_cn[k];
                // Skip the untranslated cell (rep_cn holds its index triplet):
                // that would be the atom at zero distance from itself.
                if (idx1 == rep_cn[0] && idx2 == rep_cn[1] && idx3 == rep_cn[2]) { continue; }
                const float rij[3] = {
                    tau_cn[idx1][idx2][idx3][0],
                    tau_cn[idx1][idx2][idx3][1],
                    tau_cn[idx1][idx2][idx3][2],
                };
                const float r2 = lensq3(rij);
                // NOTE(review): CN cutoff is tested with '>=' here while the
                // vdW kernel uses '> rthr' — confirm the boundary convention.
                if (r2 >= cnthr) { continue; }
                const float r_rc = rsqrtf(r2);  // 1/r
                // Derivative pieces of the CN counting function
                // 1 / (1 + exp(-K1 * (rcov/r - 1))).
                const float expterm = expf(-K1 * (rcov_sum * r_rc - 1.0f));
                const float unit_rc = 1.0f / (r2 * (expterm + 1.0f) * (expterm + 1.0f));
                const float dcnn = -K1 * rcov_sum * expterm * unit_rc;
                const float x1 = dcnn * dc6i_sum;
                // Direction: rij scaled by 1/r (r_rc), magnitude x1.
                const float vec[3] = {
                    x1 * rij[0] * r_rc,
                    x1 * rij[1] * r_rc,
                    x1 * rij[2] * r_rc
                };
                sigma_local_00 += vec[0] * rij[0];
                sigma_local_01 += vec[0] * rij[1];
                sigma_local_02 += vec[0] * rij[2];
                sigma_local_10 += vec[1] * rij[0];
                sigma_local_11 += vec[1] * rij[1];
                sigma_local_12 += vec[1] * rij[2];
                sigma_local_20 += vec[2] * rij[0];
                sigma_local_21 += vec[2] * rij[1];
                sigma_local_22 += vec[2] * rij[2];
            }
        }
        else {
            // Distinct pair: loop over all translations of jat relative to iat.
            const float rcov_sum = rcov[type[iat]] + rcov[type[jat]];
            const float dc6i_sum = dc6i[iat] + dc6i[jat];
            for (int k = maxtau - 1; k >= 0; k -= 3) {
                const int idx1 = tau_idx_cn[k-2];
                const int idx2 = tau_idx_cn[k-1];
                const int idx3 = tau_idx_cn[k];
                const float rij[3] = {
                    x[jat][0] - x[iat][0] + tau_cn[idx1][idx2][idx3][0],
                    x[jat][1] - x[iat][1] + tau_cn[idx1][idx2][idx3][1],
                    x[jat][2] - x[iat][2] + tau_cn[idx1][idx2][idx3][2]
                };
                const float r2 = lensq3(rij);
                if (r2 >= cnthr) { continue; }
                const float r_rc = rsqrtf(r2);  // 1/r
                const float expterm = expf(-K1 * (rcov_sum * r_rc - 1.0f));
                const float unit_rc = 1.0f / (r2 * (expterm + 1.0f) * (expterm + 1.0f));
                const float dcnn = -K1 * rcov_sum * expterm * unit_rc;
                const float x1 = dcnn * dc6i_sum;
                const float vec[3] = {
                    x1 * rij[0] * r_rc,
                    x1 * rij[1] * r_rc,
                    x1 * rij[2] * r_rc
                };
                // Newton's third law: equal and opposite forces on iat / jat.
                f_local[0] -= vec[0];
                f_local[1] -= vec[1];
                f_local[2] -= vec[2];
                sigma_local_00 += vec[0] * rij[0];
                sigma_local_01 += vec[0] * rij[1];
                sigma_local_02 += vec[0] * rij[2];
                sigma_local_10 += vec[1] * rij[0];
                sigma_local_11 += vec[1] * rij[1];
                sigma_local_12 += vec[1] * rij[2];
                sigma_local_20 += vec[2] * rij[0];
                sigma_local_21 += vec[2] * rij[1];
                sigma_local_22 += vec[2] * rij[2];
            }
            atomicAdd(&f[iat][0], f_local[0]);
            atomicAdd(&f[iat][1], f_local[1]);
            atomicAdd(&f[iat][2], f_local[2]);
            atomicAdd(&f[jat][0], -f_local[0]);
            atomicAdd(&f[jat][1], -f_local[1]);
            atomicAdd(&f[jat][2], -f_local[2]);
        }
    }
    // All threads (including iter >= maxij, which carry zeros) join the
    // block-wide virial reduction; barriers below are outside divergent code.
    sigma_00[threadIdx.x] = sigma_local_00;
    sigma_01[threadIdx.x] = sigma_local_01;
    sigma_02[threadIdx.x] = sigma_local_02;
    sigma_10[threadIdx.x] = sigma_local_10;
    sigma_11[threadIdx.x] = sigma_local_11;
    sigma_12[threadIdx.x] = sigma_local_12;
    sigma_20[threadIdx.x] = sigma_local_20;
    sigma_21[threadIdx.x] = sigma_local_21;
    sigma_22[threadIdx.x] = sigma_local_22;
    __syncthreads();
    // Shared-memory tree reduction; requires blockDim.x to be a power of two.
    for (int s=blockDim.x/2; s>0; s>>=1) {
        if (threadIdx.x < s) {
            sigma_00[threadIdx.x] += sigma_00[threadIdx.x + s];
            sigma_01[threadIdx.x] += sigma_01[threadIdx.x + s];
            sigma_02[threadIdx.x] += sigma_02[threadIdx.x + s];
            sigma_10[threadIdx.x] += sigma_10[threadIdx.x + s];
            sigma_11[threadIdx.x] += sigma_11[threadIdx.x + s];
            sigma_12[threadIdx.x] += sigma_12[threadIdx.x + s];
            sigma_20[threadIdx.x] += sigma_20[threadIdx.x + s];
            sigma_21[threadIdx.x] += sigma_21[threadIdx.x + s];
            sigma_22[threadIdx.x] += sigma_22[threadIdx.x + s];
        }
        __syncthreads();
    }
    // One atomic per block and component into the global virial.
    if (threadIdx.x == 0) {
        atomicAdd(&sigma[0][0], sigma_00[0]);
        atomicAdd(&sigma[0][1], sigma_01[0]);
        atomicAdd(&sigma[0][2], sigma_02[0]);
        atomicAdd(&sigma[1][0], sigma_10[0]);
        atomicAdd(&sigma[1][1], sigma_11[0]);
        atomicAdd(&sigma[1][2], sigma_12[0]);
        atomicAdd(&sigma[2][0], sigma_20[0]);
        atomicAdd(&sigma[2][1], sigma_21[0]);
        atomicAdd(&sigma[2][2], sigma_22[0]);
    }
}
/* Launch the coordination-number chain-rule kernel; it adds the
 * dE/dCN * dCN/dr contribution on top of the forces/virial already
 * accumulated by get_forces_without_dC6(). */
void PairD3::get_forces_with_dC6() {
    const int natoms = atom->natoms;
    const int pair_count = natoms * (natoms + 1) / 2;  // upper-triangular ij pairs
    const int tau_count = tau_idx_cn_total_size;

    //START_CUDA_TIMER();
    const int block_size = 128;  // kernel's shared-memory arrays assume this
    const int grid_size = (pair_count + block_size - 1) / block_size;
    kernel_get_forces_with_dC6<<<grid_size, block_size>>>(
        pair_count, tau_count, cnthr, K1,
        dc6i, rcov, rep_cn, tau_cn, tau_idx_cn, atomtype, x,
        f, sigma
    );
    cudaDeviceSynchronize();
    //STOP_CUDA_TIMER("get_forces_with");
}
/* ----------------------------------------------------------------------
Update energy, force, and stress
------------------------------------------------------------------------- */
/* Convert the accumulated atomic-unit results into the output buffers read
 * by the C wrappers: energy in eV, forces in eV/Angstrom (flattened), and
 * the virial in xx, yy, zz, xy, xz, yz order (eV). */
void PairD3::update() {
    const int natoms = atom->natoms;

    // Energy: Hartree -> eV.
    result_E = disp_total * AU_TO_EV;

    // Forces: Hartree/Bohr -> eV/Angstrom, flattened row-major for the wrapper.
    for (int idx = 0; idx < natoms * 3; ++idx) {
        result_F[idx] = f[idx / 3][idx % 3] * AU_TO_EV / AU_TO_ANG;
    }

    // Virial components, converted to eV.
    result_S[0] = sigma[0][0] * AU_TO_EV;  // xx
    result_S[1] = sigma[1][1] * AU_TO_EV;  // yy
    result_S[2] = sigma[2][2] * AU_TO_EV;  // zz
    result_S[3] = sigma[0][1] * AU_TO_EV;  // xy
    result_S[4] = sigma[0][2] * AU_TO_EV;  // xz
    result_S[5] = sigma[1][2] * AU_TO_EV;  // yz
}
/* ----------------------------------------------------------------------
Compute : energy, force, and stress (Required)
------------------------------------------------------------------------- */
// Full D3 evaluation for the current geometry. Order matters: coordination
// numbers feed the C6 interpolation, whose outputs feed both force passes;
// update() finally converts the atomic-unit accumulators to eV-based buffers.
void PairD3::compute() {
    if (atom->natoms != n_save) { reallocate_arrays(); }  // atom count changed
    set_lattice_vectors();
    precalculate_tau_array();
    load_atom_info();
    // Push per-atom types to the device; kernels index 'atomtype' directly.
    cudaMemcpy(atomtype, atom->type, atom->natoms * sizeof(int), cudaMemcpyHostToDevice);
    get_coordination_number();
    get_dC6_dCNij();
    get_forces_without_dC6();  // pairwise dispersion with fixed C6
    get_forces_with_dC6();     // dE/dCN chain-rule contribution
    update();                  // unit conversion into result_E / result_F / result_S
    CHECK_CUDA_ERROR();
}
// Placeholder entry point: the code is consumed as a shared library through
// the extern "C" wrappers below; main() exists only so the file links as a
// standalone executable.
int main() {
}
extern "C" { // C wrapper for ctypes or cffi
    // Create a PairD3 instance; release it with pair_fin().
    PairD3* pair_init() {
        return new PairD3();
    }
    // Attach atom data. 'type' and 'x_flat' stay owned by the caller and must
    // outlive the pair object; only the row-pointer table is allocated here.
    void pair_set_atom(PairD3* pair, int natoms, int ntypes, int* type, double* x_flat) {
        // Release any previous configuration so repeated calls do not leak.
        if (pair->atom) {
            delete[] pair->atom->x;  // row-pointer table from a previous call
            delete pair->atom;
            pair->atom = nullptr;
        }
        delete[] pair->result_F;
        pair->result_F = nullptr;
        // Build row pointers into the caller's flat (natoms x 3) buffer.
        double** x = new double*[natoms];
        for (int i = 0; i < natoms; i++) {
            x[i] = x_flat + i * 3;
        }
        pair->atom = new Atom(natoms, ntypes, type, x);
        pair->result_F = new double[natoms * 3];
    }
    // Define the simulation cell: periodicity flags, bounds, triclinic tilts.
    void pair_set_domain(PairD3* pair, int xperiodic, int yperiodic, int zperiodic, double* boxlo, double* boxhi, double xy, double xz, double yz) {
        delete pair->domain;  // avoid leaking an earlier domain on repeated calls
        pair->domain = new Domain(xperiodic, yperiodic, zperiodic, boxlo, boxhi, xy, xz, yz);
    }
    // Forward cutoffs and damping/functional names to PairD3::settings().
    void pair_run_settings(PairD3* pair, double rthr, double cnthr, const char* damp_name, const char* func_name) {
        pair->settings(rthr, cnthr, damp_name, func_name);
    }
    // Forward per-type atomic numbers to PairD3::coeff().
    void pair_run_coeff(PairD3* pair, int* atomic_numbers) {
        pair->coeff(atomic_numbers);
    }
    // Run one energy/force/stress evaluation.
    void pair_run_compute(PairD3* pair) {
        pair->compute();
    }
    // Dispersion energy in eV (valid after pair_run_compute()).
    double pair_get_energy(PairD3* pair) {
        return pair->result_E;
    }
    // Flattened forces (3 * natoms, eV/Angstrom); owned by the pair object.
    double* pair_get_force(PairD3* pair) {
        return pair->result_F;
    }
    // Virial (xx, yy, zz, xy, xz, yz, in eV); owned by the pair object.
    double* pair_get_stress(PairD3* pair) {
        return pair->result_S;
    }
    // Destroy the pair object. ~Pair() releases result_F, atom, domain and
    // error; the Atom row-pointer table is released here because Atom does
    // not own it (previously it leaked).
    void pair_fin(PairD3* pair) {
        if (pair && pair->atom) {
            delete[] pair->atom->x;
            pair->atom->x = nullptr;
        }
        delete pair;
    }
}
/*
This code is a skeleton of the LAMMPS pair_style d3 accelerated by CUDA.
All dependencies on LAMMPS have been removed.
The input and output variables are named based on the LAMMPS variables.
*/
#ifndef LMP_PAIR_D3
#define LMP_PAIR_D3
#include <cmath>
#include <string>
#include <vector> // for 'element_table'
#include <algorithm> // for 'atomic_number'
#include <map>
#include <unordered_map>
#include <cuda_runtime.h>
#include "pair_d3_pars.h"
// Removed dependencies to STL
// #include <stdlib.h> -> no more C style functions
// #define _USE_MATH_DEFINES -> no predefined constants
// Removed dependencies to LAMMPS
// #include "pair.h" -> removed, for construncting pair class.
// #include "utils.h" -> removed, some float converters.
// #include "atom.h" -> Atom class to replace it.
// #include "domain.h" -> Domain class to replace it.
// #include "error.h" -> Error class to replace it.
// #include "comm.h" -> already no dependency
// #include "neighbor.h" -> already no dependency
// #include "neigh_list.h" -> already no dependency
// #include "memory.h" -> already no dependency for CUDA version
// #include "math_extra.h" -> removed, dot and len3 operations.
// #include "potential_file_reader.h" -> removed, PotentialFileReader
/* --------- Fake class to replace 'LAMMPS' class --------- */
/* Minimal stand-in for LAMMPS's Atom class: only the fields PairD3 reads.
 * Non-owning: 'type' and 'x' are borrowed from the caller and are not freed
 * by the destructor. */
class Atom {
public:
    int natoms;   // number of atoms
    int ntypes;   // number of atom types
    int* type;    // per-atom type indices (borrowed)
    double** x;   // per-atom positions, x[i][0..2] (borrowed)
    Atom(int natoms_in, int ntypes_in, int* type_in, double** x_in) {
        natoms = natoms_in;
        ntypes = ntypes_in;
        type = type_in;
        x = x_in;
    }
    // Intentionally empty: ownership of 'type' and 'x' stays with the caller.
    ~Atom() {
    }
};
/* Minimal stand-in for LAMMPS's Domain: periodicity flags, box bounds and
 * triclinic tilt factors. Box bounds are copied, not borrowed. */
class Domain {
public:
    int xperiodic, yperiodic, zperiodic;  // 1 if periodic along the axis
    double boxlo[3], boxhi[3];            // lower/upper box corners (copied)
    double xy, xz, yz;                    // triclinic tilt factors
    Domain(int xperiodic, int yperiodic, int zperiodic, double* boxlo, double* boxhi, double xy, double xz, double yz)
        : xperiodic(xperiodic),
          yperiodic(yperiodic),
          zperiodic(zperiodic),
          xy(xy),
          xz(xz),
          yz(yz) {
        // Copy the bounds so the caller's arrays need not outlive this object.
        this->boxlo[0] = boxlo[0]; this->boxlo[1] = boxlo[1]; this->boxlo[2] = boxlo[2];
        this->boxhi[0] = boxhi[0]; this->boxhi[1] = boxhi[1]; this->boxhi[2] = boxhi[2];
    }
    ~Domain() {
    }
};
/* Minimal stand-in for LAMMPS's Error class. Unlike the real one it does
 * not terminate the program; it only reports and returns. */
class Error {
public:
    // Report an error message. 'flerr' mimics the LAMMPS FLERR (file/line)
    // argument and is unused in this fake implementation.
    void all(int flerr, const char* message) {
        (void)flerr;
        // Errors belong on stderr so they survive stdout redirection
        // (previously this went to stdout via printf).
        fprintf(stderr, "Error: %s\n", message);
    }
    Error() {}
    ~Error() {}
};
/* ------------------------------------------------------- */
/* --------- Declaration of fake classes and variables --------- */
#define FLERR 1
//Error* error = nullptr;
//
//int allocated;
//int** setflag;
//double** cutsq;
//Atom* atom = nullptr;
//Domain* domain = nullptr;
//
//double result_E;
//double* result_F = nullptr;
//double result_S[6];
class Pair {
public:
int allocated;
Atom* atom;
Domain* domain;
double result_E;
double* result_F;
double result_S[6];
Error* error;
Pair()
: allocated(0), atom(nullptr), domain(nullptr), result_E(0.0), result_F(nullptr), error(nullptr) {
std::fill(std::begin(result_S), std::end(result_S), 0.0);
}
virtual ~Pair() {
if (result_F) {
delete[] result_F;
result_F = nullptr;
}
if (atom) {
delete atom;
atom = nullptr;
}
if (domain) {
delete domain;
domain = nullptr;
}
if (error) {
delete error;
error = nullptr;
}
}
};
/* -------------------------------------------------------------- */
/* CUDA port of the DFT-D3 dispersion correction behind the reduced Pair
 * interface above. Per compute() call: lattice/tau setup -> coordination
 * numbers -> C6 interpolation -> two force passes -> unit conversion.
 * NOTE(review): several arrays (x, f, sigma, dc6i, disp, atomtype, tau_*)
 * are dereferenced both on the host and inside kernels, so they are
 * presumably CUDA managed allocations — confirm in the allocation routines
 * (not shown in this chunk). */
class PairD3 : public Pair {
public:
    PairD3();
    ~PairD3();
    // rthr/cnthr are squared distance cutoffs; damp_name/func_name select the
    // damping scheme and the XC-functional parameter set.
    void settings(double rthr, double cnthr, std::string damp_name, std::string func_name);
    void coeff(int* atomic_number);
    void compute();
protected:
    virtual void allocate();
    /* ------- Read parameters ------- */
    int find_atomic_number(std::string&);
    int is_int_in_array(int*, int, int);
    void read_r0ab(int*, int); // void read_r0ab(class LAMMPS*, char*, int*, int);
    void get_limit_in_pars_array(int&, int&, int&, int&);
    void read_c6ab(int*, int); // void read_c6ab(class LAMMPS*, char*, int*, int);
    // Damping-parameter setters, one per scheme; setfuncpar() dispatches.
    void setfuncpar_zero();
    void setfuncpar_bj();
    void setfuncpar_zerom();
    void setfuncpar_bjm();
    void setfuncpar();
    /* ------- Read parameters ------- */
    /* ------- Lattice information ------- */
    void set_lattice_repetition_criteria(float, int*);
    void set_lattice_vectors();
    /* ------- Lattice information ------- */
    /* ------- Initialize & Precalculate ------- */
    void load_atom_info();
    void precalculate_tau_array();
    /* ------- Initialize & Precalculate ------- */
    /* ------- Reallocate (when number of atoms changed) ------- */
    void reallocate_arrays();
    void reallocate_arrays_np1();
    /* ------- Reallocate (when number of atoms changed) ------- */
    /* ------- Coordination number ------- */
    void get_coordination_number();
    void get_dC6_dCNij();
    /* ------- Coordination number ------- */
    /* ------- Main workers ------- */
    // Force passes; _zerom and _bjm variants are currently empty stubs.
    void get_forces_without_dC6_zero();
    void get_forces_without_dC6_bj();
    void get_forces_without_dC6_zerom();
    void get_forces_without_dC6_bjm();
    void get_forces_without_dC6();
    void get_forces_with_dC6();
    void update();
    /* ------- Main workers ------- */
    /*--------- Constants ---------*/
    static constexpr int MAX_ELEM = 94;             // maximum of the element number
    static constexpr int MAXC = 5;                  // maximum coordination number references per element
    static constexpr double AU_TO_ANG = 0.52917726;  // conversion factors (atomic unit --> angstrom)
    static constexpr double AU_TO_EV = 27.21138505;  // conversion factors (atomic unit --> eV)
    static constexpr float K1 = 16.0;               // global ad hoc parameters
    static constexpr float K3 = -4.0;               // global ad hoc parameters
    /*--------- Constants ---------*/
    /*--------- Parameters to read ---------*/
    int damping;                 // damping scheme index: 0 zero, 1 bj, 2 zerom, 3 bjm
    std::string functional;      // XC functional name used to pick D3 parameters
    float* r2r4 = nullptr;       // scale r4/r2 values of the atoms by sqrt(Z)
    float* rcov = nullptr;       // covalent radii
    int* mxc = nullptr;          // How large the grid for c6 interpolation
    float** r0ab = nullptr;      // cut-off radii for all element pairs
    float***** c6ab = nullptr;   // C6 for all element pairs
    float rthr;                  // R^2 distance to cutoff for C calculation
    float cnthr;                 // R^2 distance to cutoff for CN_calculation
    float s6, s8, s18, rs6, rs8, rs18, alp, alp6, alp8, a1, a2; // parameters for D3
    /*--------- Parameters to read ---------*/
    /*--------- Lattice related values ---------*/
    double* lat_v_1 = nullptr;   // lattice coordination vector
    double* lat_v_2 = nullptr;   // lattice coordination vector
    double* lat_v_3 = nullptr;   // lattice coordination vector
    int* rep_vdw = nullptr;      // repetition of cell for calculating D3
    int* rep_cn = nullptr;       // repetition of cell for calculating
    double** sigma = nullptr;    // virial pressure on cell
    /*--------- Lattice related values ---------*/
    /*--------- Per-atom values/arrays ---------*/
    double* cn = nullptr;        // Coordination numbers
    float** x = nullptr;         // Positions
    double** f = nullptr;        // Forces
    double* dc6i = nullptr;      // dC6i(iat) saves dE_dsp/dCN(iat)
    /*--------- Per-atom values/arrays ---------*/
    /*--------- Per-pair values/arrays ---------*/
    // Per unique (i,j) pair: interpolated C6 and its CN derivatives.
    float* c6_ij_tot = nullptr;
    float* dc6_iji_tot = nullptr;
    float* dc6_ijj_tot = nullptr;
    /*--------- Per-pair values/arrays ---------*/
    /*---------- Global values ---------*/
    int n_save;                  // to check whether the number of atoms has changed
    int np1_save;                // to check whether the number of types has changed
    float disp_total;            // Dispersion energy
    /*---------- Global values ---------*/
    /*--------- For loop over tau (translation of cell) ---------*/
    float**** tau_vdw = nullptr;
    float**** tau_cn = nullptr;
    int* tau_idx_vdw = nullptr;  // flattened (idx1, idx2, idx3) triplets
    int* tau_idx_cn = nullptr;   // flattened (idx1, idx2, idx3) triplets
    int tau_idx_vdw_total_size;
    int tau_idx_cn_total_size;
    /*--------- For loop over tau (translation of cell) ---------*/
    /*--------- For cuda memory transfer (pointerized) ---------*/
    int *atomtype;               // per-atom types, copied to device in compute()
    double *disp;                // dispersion-energy accumulator shared with kernels
    /*--------- For cuda memory transfer (pointerized) ---------*/
};
#endif // LMP_PAIR_D3
This source diff could not be displayed because it is too large. You can view the blob instead.
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://lammps.sandia.gov/, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Yutack Park (SNU)
------------------------------------------------------------------------- */
#include <ATen/ops/from_blob.h>
#include <c10/core/Scalar.h>
#include <c10/core/TensorOptions.h>
#include <string>
#include <torch/script.h>
#include <torch/torch.h>
#include "atom.h"
#include "domain.h"
#include "error.h"
#include "force.h"
#include "memory.h"
#include "neigh_list.h"
#include "neigh_request.h"
#include "neighbor.h"
#include "pair_e3gnn.h"
using namespace LAMMPS_NS;
#define INTEGER_TYPE torch::TensorOptions().dtype(torch::kInt64)
#define FLOAT_TYPE torch::TensorOptions().dtype(torch::kFloat)
// Constructor: pick the torch device (GPU when available) and honor the
// SEVENN_PRINT_INFO environment variable for verbose output.
PairE3GNN::PairE3GNN(LAMMPS *lmp) : Pair(lmp) {
  if (std::getenv("SEVENN_PRINT_INFO") != nullptr) {
    print_info = true;
  }
  const bool has_cuda = torch::cuda::is_available();
  device = has_cuda ? torch::kCUDA : torch::kCPU;
  const std::string device_name = has_cuda ? "CUDA" : "CPU";
  if (lmp->logfile) {
    fprintf(lmp->logfile, "PairE3GNN using device : %s\n", device_name.c_str());
  }
}
// Destructor: free the per-type tables created in allocate().
// NOTE(review): allocate() creates setflag, cutsq and map, but 'elements' is
// never created in this chunk — confirm memory->destroy(elements) cannot be
// an invalid free when coeff()/allocate() ran without it being set.
PairE3GNN::~PairE3GNN() {
  if (allocated) {
    memory->destroy(setflag);
    memory->destroy(cutsq);
    memory->destroy(map);
    memory->destroy(elements);
  }
}
// Build the graph input (node types, positions, edges within the cutoff,
// PBC shifts) from the LAMMPS neighbor list, run the E3GNN TorchScript
// model, and scatter energy / forces / virial / per-atom energies back.
void PairE3GNN::compute(int eflag, int vflag) {
  // compute
  /*
  This compute function is ispired/modified from stress branch of pair-nequip
  https://github.com/mir-group/pair_nequip
  */
  if (eflag || vflag)
    ev_setup(eflag, vflag);
  else
    evflag = vflag_fdotr = 0;
  if (vflag_atom) {
    error->all(FLERR, "atomic stress is not supported\n");
  }
  int nlocal = list->inum; // same as nlocal
  int *ilist = list->ilist;
  tagint *tag = atom->tag;
  // Map atom tags to dense 1-based indices; needed because tags may be
  // non-consecutive (tag - 1 becomes the model's node index).
  std::unordered_map<int, int> tag_map;
  if (atom->tag_consecutive() == 0) {
    for (int ii = 0; ii < nlocal; ii++) {
      const int i = ilist[ii];
      int itag = tag[i];
      tag_map[itag] = ii+1;
      // printf("MODIFY setting %i => %i \n",itag, tag_map[itag] );
    }
  } else {
    //Ordered which mappling required
    for (int ii = 0; ii < nlocal; ii++) {
      const int itag = ilist[ii]+1;
      tag_map[itag] = ii+1;
      // printf("normal setting %i => %i \n",itag, tag_map[itag] );
    }
  }
  double **x = atom->x;
  double **f = atom->f;
  int *type = atom->type;
  long num_atoms[1] = {nlocal};
  // NOTE(review): tag2i, pbc_shift_tmp and edge_idx_* below are C-style VLAs
  // (a compiler extension) on the stack — large systems risk stack overflow;
  // consider heap allocation.
  int tag2i[nlocal];
  int *numneigh = list->numneigh;      // j loop cond
  int **firstneigh = list->firstneigh; // j list
  // Upper bound on edge count: exact neighbor-count sum on the first call,
  // the cached padded bound afterwards (updated at the end of this function).
  int bound;
  if (this->nedges_bound == -1) {
    bound = std::accumulate(numneigh, numneigh + nlocal, 0);
  } else {
    bound = this->nedges_bound;
  }
  const int nedges_upper_bound = bound;
  // Cell matrix from the (possibly triclinic) LAMMPS box.
  float cell[3][3];
  cell[0][0] = domain->boxhi[0] - domain->boxlo[0];
  cell[0][1] = 0.0;
  cell[0][2] = 0.0;
  cell[1][0] = domain->xy;
  cell[1][1] = domain->boxhi[1] - domain->boxlo[1];
  cell[1][2] = 0.0;
  cell[2][0] = domain->xz;
  cell[2][1] = domain->yz;
  cell[2][2] = domain->boxhi[2] - domain->boxlo[2];
  torch::Tensor inp_cell = torch::from_blob(cell, {3, 3}, FLOAT_TYPE);
  torch::Tensor inp_num_atoms = torch::from_blob(num_atoms, {1}, INTEGER_TYPE);
  torch::Tensor inp_node_type = torch::zeros({nlocal}, INTEGER_TYPE);
  torch::Tensor inp_pos = torch::zeros({nlocal, 3});
  torch::Tensor inp_cell_volume =
      torch::dot(inp_cell[0], torch::cross(inp_cell[1], inp_cell[2], 0));
  float pbc_shift_tmp[nedges_upper_bound][3];
  auto node_type = inp_node_type.accessor<long, 1>();
  auto pos = inp_pos.accessor<float, 2>();
  long edge_idx_src[nedges_upper_bound];
  long edge_idx_dst[nedges_upper_bound];
  int nedges = 0;
  // First pass: fill node types/positions and the tag->local-index table.
  for (int ii = 0; ii < nlocal; ii++) {
    const int i = ilist[ii];
    int itag = tag_map[tag[i]];
    tag2i[itag - 1] = i;
    const int itype = type[i];
    node_type[itag - 1] = map[itype];
    pos[itag - 1][0] = x[i][0];
    pos[itag - 1][1] = x[i][1];
    pos[itag - 1][2] = x[i][2];
  }
  // Second pass: collect edges within the cutoff and the raw position
  // difference between a ghost image and its owned atom (for PBC shifts).
  for (int ii = 0; ii < nlocal; ii++) {
    const int i = ilist[ii];
    int itag = tag_map[tag[i]];
    const int *jlist = firstneigh[i];
    const int jnum = numneigh[i];
    for (int jj = 0; jj < jnum; jj++) {
      int j = jlist[jj]; // atom over pbc is different atom
      int jtag = tag_map[tag[j]]; // atom over pbs is same atom (it starts from 1)
      j &= NEIGHMASK;
      const int jtype = type[j];
      const double delij[3] = {x[j][0] - x[i][0], x[j][1] - x[i][1],
                               x[j][2] - x[i][2]};
      // Squared distance compared against the squared cutoff (no sqrt).
      const double Rij =
          delij[0] * delij[0] + delij[1] * delij[1] + delij[2] * delij[2];
      if (Rij < cutoff_square) {
        edge_idx_src[nedges] = itag - 1;
        edge_idx_dst[nedges] = jtag - 1;
        pbc_shift_tmp[nedges][0] = x[j][0] - pos[jtag - 1][0];
        pbc_shift_tmp[nedges][1] = x[j][1] - pos[jtag - 1][1];
        pbc_shift_tmp[nedges][2] = x[j][2] - pos[jtag - 1][2];
        nedges++;
      }
    } // j loop end
  } // i loop end
  auto edge_idx_src_tensor =
      torch::from_blob(edge_idx_src, {nedges}, INTEGER_TYPE);
  auto edge_idx_dst_tensor =
      torch::from_blob(edge_idx_dst, {nedges}, INTEGER_TYPE);
  auto inp_edge_index =
      torch::stack({edge_idx_src_tensor, edge_idx_dst_tensor});
  // r' = r + {shift_tensor(integer vector of len 3)} @ cell_tensor
  // shift_tensor = (cell_tensor)^-1^T @ (r' - r)
  torch::Tensor cell_inv_tensor =
      inp_cell.inverse().transpose(0, 1).unsqueeze(0).to(device);
  torch::Tensor pbc_shift_tmp_tensor =
      torch::from_blob(pbc_shift_tmp, {nedges, 3}, FLOAT_TYPE)
          .view({nedges, 3, 1})
          .to(device);
  torch::Tensor inp_cell_shift =
      torch::bmm(cell_inv_tensor.expand({nedges, 3, 3}), pbc_shift_tmp_tensor)
          .view({nedges, 3});
  // Positions need gradients so the model can produce forces.
  inp_pos.set_requires_grad(true);
  c10::Dict<std::string, torch::Tensor> input_dict;
  input_dict.insert("x", inp_node_type.to(device));
  input_dict.insert("pos", inp_pos.to(device));
  input_dict.insert("edge_index", inp_edge_index.to(device));
  input_dict.insert("num_atoms", inp_num_atoms.to(device));
  input_dict.insert("cell_lattice_vectors", inp_cell.to(device));
  input_dict.insert("cell_volume", inp_cell_volume.to(device));
  input_dict.insert("pbc_shift", inp_cell_shift);
  std::vector<torch::IValue> input(1, input_dict);
  // Run the TorchScript model and pull the outputs back to the CPU.
  auto output = model.forward(input).toGenericDict();
  torch::Tensor total_energy_tensor =
      output.at("inferred_total_energy").toTensor().cpu();
  torch::Tensor force_tensor = output.at("inferred_force").toTensor().cpu();
  auto forces = force_tensor.accessor<float, 2>();
  eng_vdwl += total_energy_tensor.item<float>();
  // Scatter model forces back to LAMMPS atom order via tag2i.
  for (int itag = 0; itag < nlocal; itag++) {
    int i = tag2i[itag];
    f[i][0] += forces[itag][0];
    f[i][1] += forces[itag][1];
    f[i][2] += forces[itag][2];
  }
  if (vflag) {
    // more accurately, it is virial part of stress
    torch::Tensor stress_tensor = output.at("inferred_stress").toTensor().cpu();
    auto virial_stress_tensor = stress_tensor * inp_cell_volume;
    // xy yz zx order in vasp (voigt is xx yy zz yz xz xy)
    auto virial_stress = virial_stress_tensor.accessor<float, 1>();
    virial[0] += virial_stress[0];
    virial[1] += virial_stress[1];
    virial[2] += virial_stress[2];
    virial[3] += virial_stress[3];
    virial[4] += virial_stress[5];
    virial[5] += virial_stress[4];
  }
  if (eflag_atom) {
    torch::Tensor atomic_energy_tensor =
        output.at("atomic_energy").toTensor().cpu().squeeze();
    auto atomic_energy = atomic_energy_tensor.accessor<float, 1>();
    for (int itag = 0; itag < nlocal; itag++) {
      int i = tag2i[itag];
      eatom[i] += atomic_energy[itag];
    }
  }
  // Cache the edge-count bound with 20% slack to size next step's arrays.
  // if it was the first MD step
  if (this->nedges_bound == -1) {
    this->nedges_bound = nedges * 1.2;
  } // else if the nedges is too small, increase the bound
  else if (nedges > this->nedges_bound / 1.2) {
    this->nedges_bound = nedges * 1.2;
  }
}
// allocate arrays (called from coeff)
// Create the per-type tables LAMMPS expects (invoked from coeff()).
void PairE3GNN::allocate() {
  allocated = 1;
  const int np1 = atom->ntypes + 1; // LAMMPS type indices start at 1
  memory->create(setflag, np1, np1, "pair:setflag");
  memory->create(cutsq, np1, np1, "pair:cutsq");
  memory->create(map, np1, "pair:map");
}
// global settings for pair_style
// pair_style e3gnn takes no arguments; anything extra is an error.
void PairE3GNN::settings(int narg, char **arg) {
  if (narg == 0)
    return;
  error->all(FLERR, "Illegal pair_style command");
}
void PairE3GNN::coeff(int narg, char **arg) {
if (allocated) {
error->all(FLERR, "pair_e3gnn coeff called twice");
}
allocate();
if (strcmp(arg[0], "*") != 0 || strcmp(arg[1], "*") != 0) {
error->all(FLERR,
"e3gnn: first and second input of pair_coeff should be '*'");
}
// expected input : pair_coeff * * pot.pth type_name1 type_name2 ...
std::unordered_map<std::string, std::string> meta_dict = {
{"chemical_symbols_to_index", ""},
{"cutoff", ""},
{"num_species", ""},
{"model_type", ""},
{"version", ""},
{"dtype", ""},
{"time", ""}};
// model loading from input
try {
model = torch::jit::load(std::string(arg[2]), device, meta_dict);
} catch (const c10::Error &e) {
error->all(FLERR, "error loading the model, check the path of the model");
}
// model = torch::jit::freeze(model); model is already freezed
torch::jit::setGraphExecutorOptimize(false);
torch::jit::FusionStrategy strategy;
// thing about dynamic recompile as tensor shape varies, this is default
// strategy = {{torch::jit::FusionBehavior::DYNAMIC, 3}};
strategy = {{torch::jit::FusionBehavior::STATIC, 0}};
torch::jit::setFusionStrategy(strategy);
cutoff = std::stod(meta_dict["cutoff"]);
cutoff_square = cutoff * cutoff;
if (meta_dict["model_type"].compare("E3_equivariant_model") != 0) {
error->all(FLERR, "given model type is not E3_equivariant_model");
}
std::string chem_str = meta_dict["chemical_symbols_to_index"];
int ntypes = atom->ntypes;
auto delim = " ";
char *tok = std::strtok(const_cast<char *>(chem_str.c_str()), delim);
std::vector<std::string> chem_vec;
while (tok != nullptr) {
chem_vec.push_back(std::string(tok));
tok = std::strtok(nullptr, delim);
}
bool found_flag = false;
for (int i = 3; i < narg; i++) {
found_flag = false;
for (int j = 0; j < chem_vec.size(); j++) {
if (chem_vec[j].compare(arg[i]) == 0) {
map[i - 2] = j;
found_flag = true;
fprintf(lmp->logfile, "Chemical specie '%s' is assigned to type %d\n",
arg[i], i - 2);
break;
}
}
if (!found_flag) {
error->all(FLERR, "Unknown chemical specie is given");
}
}
if (ntypes > narg - 3) {
error->all(FLERR, "Not enough chemical specie is given. Check pair_coeff "
"and types in your data/script");
}
for (int i = 1; i <= ntypes; i++) {
for (int j = 1; j <= ntypes; j++) {
if ((map[i] >= 0) && (map[j] >= 0)) {
setflag[i][j] = 1;
cutsq[i][j] = cutoff * cutoff;
}
}
}
if (lmp->logfile) {
fprintf(lmp->logfile, "from sevenn version '%s' ",
meta_dict["version"].c_str());
fprintf(lmp->logfile, "%s precision model, deployed when: %s\n",
meta_dict["dtype"].c_str(), meta_dict["time"].c_str());
}
}
// init specific to this pair
// Pair-style initialization: request a full neighbor list, since E3GNN is a
// many-body model.
void PairE3GNN::init_style() {
  // The historical newton-pair check is intentionally omitted: the flag is
  // irrelevant when the simulation runs on a single processor.
  /*
  if (force->newton_pair == 0) {
    error->all(FLERR, "Pair style nn requires newton pair on");
  }
  */
  neighbor->add_request(this, NeighConst::REQ_FULL);
}
// All type pairs share the model's cutoff; no mixing rules apply.
double PairE3GNN::init_one(int i, int j) {
  return cutoff;
}
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef PAIR_CLASS
PairStyle(e3gnn, PairE3GNN)
#else
#ifndef LMP_PAIR_E3GNN
#define LMP_PAIR_E3GNN
#include "pair.h"
#include <torch/torch.h>
namespace LAMMPS_NS {
/* LAMMPS pair style wrapping a deployed E3GNN TorchScript model.
 * NOTE(review): the .cpp also uses 'map' and 'elements', which are not
 * declared here — presumably inherited from Pair in this LAMMPS version;
 * confirm against the Pair base class. */
class PairE3GNN : public Pair {
private:
  double cutoff;              // model cutoff radius (from model metadata)
  double cutoff_square;       // cutoff^2, compared against squared distances
  torch::jit::Module model;   // deployed TorchScript model
  torch::Device device = torch::kCPU;  // chosen in the constructor (CUDA if available)
  int nelements;
  bool print_info = false;    // verbose output toggle (SEVENN_PRINT_INFO env var)
  int nedges_bound = -1;      // cached edge-count upper bound; -1 until first compute()
public:
  PairE3GNN(class LAMMPS *);
  ~PairE3GNN();
  void compute(int, int);
  void settings(int, char **);
  // read Atom type string from input script & related coeff
  void coeff(int, char **);
  void allocate();
  void init_style();
  double init_one(int, int);
};
} // namespace LAMMPS_NS
#endif
#endif
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://lammps.sandia.gov/, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Yutack Park (SNU)
------------------------------------------------------------------------- */
#include <ATen/core/Dict.h>
#include <ATen/core/ivalue_inl.h>
#include <ATen/ops/from_blob.h>
#include <c10/core/Scalar.h>
#include <c10/core/TensorOptions.h>
#include <cstdlib>
#include <filesystem>
#include <numeric>
#include <string>
#include <torch/csrc/jit/api/module.h>
#include <torch/script.h>
#include <torch/torch.h>
#include <cuda_runtime.h>
#include "atom.h"
#include "comm.h"
#include "comm_brick.h"
#include "error.h"
#include "force.h"
#include "memory.h"
#include "neigh_list.h"
#include "neighbor.h"
// #include "nvToolsExt.h"
#include "pair_e3gnn_parallel.h"
#include <cassert>
#ifdef OMPI_MPI_H
#include "mpi-ext.h" //This should be included after mpi.h which is included in pair.h
#endif
using namespace LAMMPS_NS;
#define INTEGER_TYPE torch::TensorOptions().dtype(torch::kInt64)
#define FLOAT_TYPE torch::TensorOptions().dtype(torch::kFloat)
// Meyers-singleton accessor: the buffer manager is constructed on first use
// and destroyed at program exit (freeing its device buffers).
DeviceBuffManager &DeviceBuffManager::getInstance() {
  static DeviceBuffManager instance;
  return instance;
}
void DeviceBuffManager::get_buffer(int send_size, int recv_size,
float *&buf_send_ptr, float *&buf_recv_ptr) {
if (send_size > send_buf_size) {
cudaFree(buf_send_device);
cudaError_t cuda_err =
cudaMalloc(&buf_send_device, send_size * sizeof(float));
send_buf_size = send_size;
}
if (recv_size > recv_buf_size) {
cudaFree(buf_recv_device);
cudaError_t cuda_err =
cudaMalloc(&buf_recv_device, recv_size * sizeof(float));
recv_buf_size = recv_size;
}
buf_send_ptr = buf_send_device;
buf_recv_ptr = buf_recv_device;
}
// Release the device staging buffers (cudaFree(nullptr) is a no-op, so this
// is safe even if get_buffer was never called).
DeviceBuffManager::~DeviceBuffManager() {
  cudaFree(buf_send_device);
  cudaFree(buf_recv_device);
}
// Constructor: detect GPU and CUDA-aware MPI availability, select the
// compute device and the device used for MPI staging buffers, and report
// the configuration on screen/logfile when those streams exist.
PairE3GNNParallel::PairE3GNNParallel(LAMMPS *lmp) : Pair(lmp) {
  const char *print_flag = std::getenv("SEVENN_PRINT_INFO");
  const char *print_both_flag = std::getenv("SEVENN_PRINT_BOTH_INFO");
  if (print_flag) {
    world_rank = comm->me;
    std::cout << "process rank: " << world_rank << " initialized" << std::endl;
    print_info = (world_rank == 0) || print_both_flag;
  }
  std::string device_name;
  const bool use_gpu = torch::cuda::is_available();
  comm_forward = 0;
  comm_reverse = 0;
  // OpenMPI detection of CUDA-aware support (compile-time + runtime query)
#ifdef OMPI_MPI_H
#if defined(MPIX_CUDA_AWARE_SUPPORT)
  if (1 == MPIX_Query_cuda_support()) {
    use_cuda_mpi = true;
  } else {
    use_cuda_mpi = false;
  }
#else
  use_cuda_mpi = false;
#endif
#else
  use_cuda_mpi = false;
#endif
  if (use_gpu) {
    device = get_cuda_device();
    device_name = "CUDA";
  } else {
    device = torch::kCPU;
    device_name = "CPU";
  }
  // user override to force host-staged communication
  if (std::getenv("OFF_E3GNN_PARALLEL_CUDA_MPI")) {
    use_cuda_mpi = false;
  }
  // Decide the communication device once, independent of logging streams.
  // Bug fix: this assignment previously lived inside the `if (lmp->screen)`
  // and `if (lmp->logfile)` blocks, so device_comm silently kept its default
  // when neither stream was open.
  const bool host_staged_comm = use_gpu && !use_cuda_mpi;
  device_comm = host_staged_comm ? torch::kCPU : device;
  if (lmp->screen) {
    if (host_staged_comm) {
      fprintf(lmp->screen,
              "cuda-aware mpi not found, communicate via host device\n");
    }
    fprintf(lmp->screen, "PairE3GNNParallel using device : %s\n",
            device_name.c_str());
    fprintf(lmp->screen, "PairE3GNNParallel cuda-aware mpi: %s\n",
            use_cuda_mpi ? "True" : "False");
  }
  if (lmp->logfile) {
    if (host_staged_comm) {
      fprintf(lmp->logfile,
              "cuda-aware mpi not found, communicate via host device\n");
    }
    fprintf(lmp->logfile, "PairE3GNNParallel using device : %s\n",
            device_name.c_str());
    fprintf(lmp->logfile, "PairE3GNNParallel cuda-aware mpi: %s\n",
            use_cuda_mpi ? "True" : "False");
  }
}
// Assign this MPI rank to a GPU round-robin over the visible devices and
// make it the current CUDA device. Called only when torch reports CUDA is
// available.
torch::Device PairE3GNNParallel::get_cuda_device() {
  const int rank = comm->me;
  const int num_gpus = torch::cuda::device_count();
  // guard the modulo below against division by zero; is_available() was
  // true in the caller so this should not normally trigger
  if (num_gpus < 1) {
    error->all(FLERR, "e3gnn/parallel: CUDA reported available but no "
                      "CUDA device was found");
  }
  const int idx = rank % num_gpus;
  if (print_info)
    std::cout << world_rank << " Available # of GPUs found: " << num_gpus
              << std::endl;
  cudaError_t cuda_err = cudaSetDevice(idx);
  if (cuda_err != cudaSuccess) {
    std::cerr << "E3GNN: Failed to set CUDA device: "
              << cudaGetErrorString(cuda_err) << std::endl;
  }
  return torch::Device(torch::kCUDA, idx);
}
// Destructor: free the per-type tables created in allocate(); guarded
// because coeff() (and hence allocate()) may never have run.
PairE3GNNParallel::~PairE3GNNParallel() {
  if (allocated) {
    memory->destroy(setflag);
    memory->destroy(cutsq);
    memory->destroy(map);
  }
}
// Per-atom feature width of the tensors currently being communicated.
int PairE3GNNParallel::get_x_dim() { return x_dim; }
// Whether buffers are exchanged directly on the GPU via CUDA-aware MPI.
bool PairE3GNNParallel::use_cuda_mpi_() { return use_cuda_mpi; }
// Whether the index-building (preprocess) pass already ran this step.
bool PairE3GNNParallel::is_comm_preprocess_done() {
  return comm_preprocess_done;
}
void PairE3GNNParallel::compute(int eflag, int vflag) {
/*
Graph build on cpu
*/
if (eflag || vflag)
ev_setup(eflag, vflag);
else
evflag = vflag_fdotr = 0;
if (vflag_atom) {
error->all(FLERR, "atomic stress is not supported\n");
}
if (atom->tag_consecutive() == 0) {
error->all(FLERR, "Pair e3gnn requires consecutive atom IDs");
}
double **x = atom->x;
double **f = atom->f;
int *type = atom->type;
int nlocal = list->inum; // same as nlocal
int nghost = atom->nghost;
int ntotal = nlocal + nghost;
int *ilist = list->ilist;
int inum = list->inum;
CommBrick *comm_brick = dynamic_cast<CommBrick *>(comm);
if (comm_brick == nullptr) {
error->all(FLERR, "e3gnn/parallel: comm style should be brick & from "
"modified code of comm_brick");
}
bigint natoms = atom->natoms;
// tag ignore PBC
tagint *tag = atom->tag;
// store graph_idx from local to known ghost atoms(ghost atoms inside cutoff)
int tag_to_graph_idx[natoms + 1]; // tag starts from 1 not 0
std::fill_n(tag_to_graph_idx, natoms + 1, -1);
// to access tag_to_graph_idx from comm
tag_to_graph_idx_ptr = tag_to_graph_idx;
int graph_indexer = nlocal;
int graph_index_to_i[ntotal];
int *numneigh = list->numneigh; // j loop cond
int **firstneigh = list->firstneigh; // j list
const int nedges_upper_bound =
std::accumulate(numneigh, numneigh + nlocal, 0);
std::vector<long> node_type;
std::vector<long> node_type_ghost;
float edge_vec[nedges_upper_bound][3];
long edge_idx_src[nedges_upper_bound];
long edge_idx_dst[nedges_upper_bound];
int nedges = 0;
for (int ii = 0; ii < inum; ii++) {
// populate tag_to_graph_idx of local atoms
const int i = ilist[ii];
const int itag = tag[i];
const int itype = type[i];
tag_to_graph_idx[itag] = ii;
graph_index_to_i[ii] = i;
node_type.push_back(map[itype]);
}
// loop over neighbors, build graph
for (int ii = 0; ii < inum; ii++) {
const int i = ilist[ii];
const int i_graph_idx = ii;
const int *jlist = firstneigh[i];
const int jnum = numneigh[i];
for (int jj = 0; jj < jnum; jj++) {
int j = jlist[jj];
const int jtag = tag[j];
j &= NEIGHMASK;
const int jtype = type[j];
// we have to calculate Rij to check cutoff in lammps side
const double delij[3] = {x[j][0] - x[i][0], x[j][1] - x[i][1],
x[j][2] - x[i][2]};
const double Rij =
delij[0] * delij[0] + delij[1] * delij[1] + delij[2] * delij[2];
int j_graph_idx;
if (Rij < cutoff_square) {
// if given j is not local atom and inside cutoff
if (tag_to_graph_idx[jtag] == -1) {
// if j is ghost atom inside cutoff but first seen
tag_to_graph_idx[jtag] = graph_indexer;
graph_index_to_i[graph_indexer] = j;
node_type_ghost.push_back(map[jtype]);
graph_indexer++;
}
j_graph_idx = tag_to_graph_idx[jtag];
edge_idx_src[nedges] = i_graph_idx;
edge_idx_dst[nedges] = j_graph_idx;
edge_vec[nedges][0] = delij[0];
edge_vec[nedges][1] = delij[1];
edge_vec[nedges][2] = delij[2];
nedges++;
}
} // j loop end
} // i loop end
// member variable
graph_size = graph_indexer;
const int ghost_node_num = graph_size - nlocal;
// convert data to Tensor
auto inp_node_type = torch::from_blob(node_type.data(), nlocal, INTEGER_TYPE);
auto inp_node_type_ghost =
torch::from_blob(node_type_ghost.data(), ghost_node_num, INTEGER_TYPE);
long num_nodes[1] = {long(nlocal)};
auto inp_num_atoms = torch::from_blob(num_nodes, {1}, INTEGER_TYPE);
auto edge_idx_src_tensor =
torch::from_blob(edge_idx_src, {nedges}, INTEGER_TYPE);
auto edge_idx_dst_tensor =
torch::from_blob(edge_idx_dst, {nedges}, INTEGER_TYPE);
auto inp_edge_index =
torch::stack({edge_idx_src_tensor, edge_idx_dst_tensor});
auto inp_edge_vec = torch::from_blob(edge_vec, {nedges, 3}, FLOAT_TYPE);
if (print_info) {
std::cout << world_rank << " Nlocal: " << nlocal << std::endl;
std::cout << world_rank << " Graph_size: " << graph_size << std::endl;
std::cout << world_rank << " Ghost_node_num: " << ghost_node_num
<< std::endl;
std::cout << world_rank << " Nedges: " << nedges << "\n" << std::endl;
}
// r_original requires grad True
inp_edge_vec.set_requires_grad(true);
torch::Dict<std::string, torch::Tensor> input_dict;
input_dict.insert("x", inp_node_type.to(device));
input_dict.insert("x_ghost", inp_node_type_ghost.to(device));
input_dict.insert("edge_index", inp_edge_index.to(device));
input_dict.insert("edge_vec", inp_edge_vec.to(device));
input_dict.insert("num_atoms", inp_num_atoms.to(device));
input_dict.insert("nlocal", inp_num_atoms.to(torch::kCPU));
std::list<std::vector<torch::Tensor>> wrt_tensors;
wrt_tensors.push_back({input_dict.at("edge_vec")});
auto model_part = model_list.front();
auto output = model_part.forward({input_dict}).toGenericDict();
comm_preprocess();
// extra_graph_idx_map is set from comm_preprocess();
// last one is for trash values. See pack_forward_init
const int extra_size =
ghost_node_num + static_cast<int>(extra_graph_idx_map.size()) + 1;
torch::Tensor x_local;
torch::Tensor x_ghost;
for (auto it = model_list.begin(); it != model_list.end(); ++it) {
if (it == model_list.begin())
continue;
model_part = *it;
x_local = output.at("x").toTensor().detach().to(device);
x_dim = x_local.size(1); // length of per atom vector(node feature)
auto ghost_and_extra_x = torch::zeros({ghost_node_num + extra_size, x_dim},
FLOAT_TYPE.device(device));
x_comm = torch::cat({x_local, ghost_and_extra_x}, 0).to(device_comm);
comm_brick->forward_comm(this); // populate x_ghost by communication
// What we got from forward_comm (node feature of ghosts)
x_ghost = torch::split_with_sizes(
x_comm, {nlocal, ghost_node_num, extra_size}, 0)[1];
x_ghost.set_requires_grad(true);
// prepare next input (output > next input)
output.insert_or_assign("x_ghost", x_ghost.to(device));
// make another edge_vec to discriminate grad calculation with other
// edge_vecs(maybe redundant?)
output.insert_or_assign("edge_vec",
output.at("edge_vec").toTensor().clone());
// save tensors for backprop
wrt_tensors.push_back({output.at("edge_vec").toTensor(),
output.at("x").toTensor(),
output.at("self_cont_tmp").toTensor(),
output.at("x_ghost").toTensor()});
output = model_part.forward({output}).toGenericDict();
}
torch::Tensor energy_tensor =
output.at("inferred_total_energy").toTensor().squeeze();
torch::Tensor dE_dr =
torch::zeros({nedges, 3}, FLOAT_TYPE.device(device)); // create on device
torch::Tensor x_local_save; // holds grad info of x_local (it loses its grad
// when sends to CPU)
torch::Tensor self_conn_grads;
std::vector<torch::Tensor> grads;
std::vector<torch::Tensor> of_tensor;
// TODO: most values of self_conn_grads were zero because we use only scalars
// for energy
for (auto rit = wrt_tensors.rbegin(); rit != wrt_tensors.rend(); ++rit) {
// edge_vec, x, x_ghost order
auto wrt_tensor = *rit;
if (rit == wrt_tensors.rbegin()) {
grads = torch::autograd::grad({energy_tensor}, wrt_tensor);
} else {
x_local_save.copy_(x_local);
// of wrt grads_output
grads = torch::autograd::grad(of_tensor, wrt_tensor,
{x_local_save, self_conn_grads});
}
dE_dr = dE_dr + grads.at(0); // accumulate force
if (std::distance(rit, wrt_tensors.rend()) == 1)
continue; // if last iteration
of_tensor.clear();
of_tensor.push_back(wrt_tensor[1]); // x
of_tensor.push_back(wrt_tensor[2]); // self_cont_tmp
x_local_save = grads.at(1); // for grads_output
x_local = x_local_save.detach(); // grad_outputs & communication
x_dim = x_local.size(1);
self_conn_grads = grads.at(2); // no communication, for grads_output
x_ghost = grads.at(3).detach(); // yes communication, not for grads_output
auto extra_x = torch::zeros({extra_size, x_dim}, FLOAT_TYPE.device(device));
x_comm = torch::cat({x_local, x_ghost, extra_x}, 0).to(device_comm);
comm_brick->reverse_comm(this); // completes x_local
// now x_local is complete (dE_dx), become next grads_output(with
// self_conn_grads)
x_local = torch::split_with_sizes(
x_comm, {nlocal, ghost_node_num, extra_size}, 0)[0];
}
// postprocessing
if (print_info) {
size_t free, tot;
cudaMemGetInfo(&free, &tot);
std::cout << world_rank << " MEM use after backward(MB)" << std::endl;
double Mfree = static_cast<double>(free) / (1024 * 1024);
double Mtot = static_cast<double>(tot) / (1024 * 1024);
std::cout << world_rank << " Total: " << Mtot << std::endl;
std::cout << world_rank << " Free: " << Mfree << std::endl;
std::cout << world_rank << " Used: " << Mtot - Mfree << std::endl;
double Mused = Mtot - Mfree;
std::cout << world_rank << " Used/Nedges: " << Mused / nedges << std::endl;
std::cout << world_rank << " Used/Nlocal: " << Mused / nlocal << std::endl;
std::cout << world_rank << " Used/GraphSize: " << Mused / graph_size << "\n"
<< std::endl;
}
eng_vdwl += energy_tensor.item<float>(); // accumulate energy
dE_dr = dE_dr.to(torch::kCPU);
torch::Tensor force_tensor = torch::zeros({graph_indexer, 3});
auto _edge_idx_src_tensor =
edge_idx_src_tensor.repeat_interleave(3).view({nedges, 3});
auto _edge_idx_dst_tensor =
edge_idx_dst_tensor.repeat_interleave(3).view({nedges, 3});
force_tensor.scatter_reduce_(0, _edge_idx_src_tensor, dE_dr, "sum");
force_tensor.scatter_reduce_(0, _edge_idx_dst_tensor, torch::neg(dE_dr),
"sum");
auto forces = force_tensor.accessor<float, 2>();
for (int graph_idx = 0; graph_idx < graph_indexer; graph_idx++) {
int i = graph_index_to_i[graph_idx];
f[i][0] += forces[graph_idx][0];
f[i][1] += forces[graph_idx][1];
f[i][2] += forces[graph_idx][2];
}
if (vflag) {
auto diag = inp_edge_vec * dE_dr;
auto s12 = inp_edge_vec.select(1, 0) * dE_dr.select(1, 1);
auto s23 = inp_edge_vec.select(1, 1) * dE_dr.select(1, 2);
auto s31 = inp_edge_vec.select(1, 2) * dE_dr.select(1, 0);
std::vector<torch::Tensor> voigt_list = {
diag, s12.unsqueeze(-1), s23.unsqueeze(-1), s31.unsqueeze(-1)};
auto voigt = torch::cat(voigt_list, 1);
torch::Tensor per_atom_stress_tensor = torch::zeros({graph_indexer, 6});
auto _edge_idx_dst6_tensor =
edge_idx_dst_tensor.repeat_interleave(6).view({nedges, 6});
per_atom_stress_tensor.scatter_reduce_(0, _edge_idx_dst6_tensor, voigt,
"sum");
auto virial_stress_tensor =
torch::neg(torch::sum(per_atom_stress_tensor, 0));
auto virial_stress = virial_stress_tensor.accessor<float, 1>();
virial[0] += virial_stress[0];
virial[1] += virial_stress[1];
virial[2] += virial_stress[2];
virial[3] += virial_stress[3];
virial[4] += virial_stress[5];
virial[5] += virial_stress[4];
}
if (eflag_atom) {
torch::Tensor atomic_energy_tensor =
output.at("atomic_energy").toTensor().cpu().squeeze();
auto atomic_energy = atomic_energy_tensor.accessor<float, 1>();
for (int graph_idx = 0; graph_idx < nlocal; graph_idx++) {
int i = graph_index_to_i[graph_idx];
eatom[i] += atomic_energy[graph_idx];
}
}
// clean up comm preprocess variables
comm_preprocess_done = false;
for (int i = 0; i < 6; i++) {
// array of vector<long>
comm_index_pack_forward[i].clear();
comm_index_unpack_forward[i].clear();
comm_index_unpack_reverse[i].clear();
}
extra_graph_idx_map.clear();
}
// allocate arrays (called from coeff)
// Create the per-type tables LAMMPS expects from a pair style; called once
// from coeff(). Tables are indexed 1..ntypes, hence the +1 sizing.
void PairE3GNNParallel::allocate() {
  allocated = 1;
  const int ntypes = atom->ntypes;
  memory->create(setflag, ntypes + 1, ntypes + 1, "pair:setflag");
  memory->create(cutsq, ntypes + 1, ntypes + 1, "pair:cutsq");
  memory->create(map, ntypes + 1, "pair:map");
}
// global settings for pair_style
// pair_style e3gnn/parallel accepts no arguments; everything is configured
// through pair_coeff.
void PairE3GNNParallel::settings(int narg, char **arg) {
  if (narg != 0)
    error->all(FLERR, "Illegal pair_style command");
}
// pair_coeff * * <n_model> <model dir | n_model .pt files> <type names...>
// Loads the chain of deployed TorchScript model segments, reads the model
// metadata (cutoff, species list, comm buffer size, ...) and maps LAMMPS
// atom types onto the model's chemical-species indices.
void PairE3GNNParallel::coeff(int narg, char **arg) {
  if (allocated) {
    error->all(FLERR, "pair_e3gnn coeff called twice");
  }
  allocate();
  if (strcmp(arg[0], "*") != 0 || strcmp(arg[1], "*") != 0) {
    error->all(FLERR,
               "e3gnn: first and second input of pair_coeff should be '*'");
  }
  // metadata entries requested from the TorchScript archive on load
  std::unordered_map<std::string, std::string> meta_dict = {
      {"chemical_symbols_to_index", ""},
      {"cutoff", ""},
      {"num_species", ""},
      {"model_type", ""},
      {"version", ""},
      {"dtype", ""},
      {"time", ""},
      {"comm_size", ""}};
  // model loading from input
  int n_model = std::stoi(arg[2]);
  int chem_arg_i = 4; // index of first chemical-symbol argument
  std::vector<std::string> model_fnames;
  if (std::filesystem::exists(arg[3])) {
    if (std::filesystem::is_directory(arg[3])) {
      // directory form: deployed_parallel_{i}.pt inside the given directory
      auto headf = std::string(arg[3]);
      for (int i = 0; i < n_model; i++) {
        auto stri = std::to_string(i);
        model_fnames.push_back(headf + "/deployed_parallel_" + stri + ".pt");
      }
    } else if (std::filesystem::is_regular_file(arg[3])) {
      // explicit-file form: n_model file names follow
      for (int i = 3; i < n_model + 3; i++) {
        model_fnames.push_back(std::string(arg[i]));
      }
      chem_arg_i = n_model + 3;
    } else {
      error->all(FLERR, "No such file or directory:" + std::string(arg[3]));
    }
  } else {
    // fail early; previously a missing path fell through silently, leaving
    // the model list empty and crashing later in compute()
    error->all(FLERR, "No such file or directory:" + std::string(arg[3]));
  }
  for (const auto &modelf : model_fnames) {
    if (!std::filesystem::is_regular_file(modelf)) {
      error->all(FLERR, "Expected this is a regular file:" + modelf);
    }
    model_list.push_back(torch::jit::load(modelf, device, meta_dict));
  }
  torch::jit::setGraphExecutorOptimize(false);
  torch::jit::FusionStrategy strategy;
  strategy = {{torch::jit::FusionBehavior::STATIC, 0}};
  torch::jit::setFusionStrategy(strategy);
  cutoff = std::stod(meta_dict["cutoff"]);
  // maximum possible size of per atom x before last convolution
  // (stoi, not stod: comm_size is an integer count)
  int comm_size = std::stoi(meta_dict["comm_size"]);
  // to initialize buffer size for communication
  comm_forward = comm_size;
  comm_reverse = comm_size;
  cutoff_square = cutoff * cutoff;
  if (meta_dict["model_type"].compare("E3_equivariant_model") != 0) {
    error->all(FLERR, "given model type is not E3_equivariant_model");
  }
  // Split the space-separated chemical-symbol list. The previous strtok on
  // const_cast<char*>(std::string::c_str()) wrote into the string's internal
  // buffer, which is undefined behavior.
  const std::string &chem_str = meta_dict["chemical_symbols_to_index"];
  int ntypes = atom->ntypes;
  std::vector<std::string> chem_vec;
  size_t pos = 0;
  while (pos < chem_str.size()) {
    size_t next = chem_str.find(' ', pos);
    if (next == std::string::npos) next = chem_str.size();
    if (next > pos) chem_vec.push_back(chem_str.substr(pos, next - pos));
    pos = next + 1;
  }
  // what if unknown chemical specie is in arg? should I abort? is there any use
  // case for that?
  bool found_flag = false;
  int n_chem = narg - chem_arg_i;
  for (int i = 0; i < n_chem; i++) {
    found_flag = false;
    for (size_t j = 0; j < chem_vec.size(); j++) {
      if (chem_vec[j].compare(arg[i + chem_arg_i]) == 0) {
        map[i + 1] = static_cast<int>(j); // store from 1, (not 0)
        found_flag = true;
        if (lmp->logfile) {
          fprintf(lmp->logfile, "Chemical specie '%s' is assigned to type %d\n",
                  arg[i + chem_arg_i], i + 1);
        }
        // break moved out of the logfile check: the scan should stop on a
        // match whether or not a logfile is open
        break;
      }
    }
    if (!found_flag) {
      error->all(FLERR, "Unknown chemical specie is given or the number of "
                        "potential files is not consistent");
    }
  }
  for (int i = 1; i <= ntypes; i++) {
    for (int j = 1; j <= ntypes; j++) {
      if ((map[i] >= 0) && (map[j] >= 0)) {
        setflag[i][j] = 1;
        cutsq[i][j] = cutoff * cutoff;
      }
    }
  }
  if (lmp->logfile) {
    fprintf(lmp->logfile, "from sevenn version '%s' ",
            meta_dict["version"].c_str());
    fprintf(lmp->logfile, "%s precision model, deployed when: %s\n",
            meta_dict["dtype"].c_str(), meta_dict["time"].c_str());
  }
}
// init specific to this pair
// Pair-style initialization hook: this style needs a full neighbor list and
// newton pair on (each edge is built once from its owning atom).
void PairE3GNNParallel::init_style() {
  if (force->newton_pair == 0)
    error->all(FLERR, "Pair style e3gnn/parallel requires newton pair on");
  neighbor->add_request(this, NeighConst::REQ_FULL);
}
// All type pairs share the single model cutoff; nothing to mix per pair.
double PairE3GNNParallel::init_one(int i, int j) { return cutoff; }
// Cache CommBrick's per-swap send/recv ranks for the six swap directions;
// comm_preprocess() groups duplicate-index bookkeeping by send rank.
void PairE3GNNParallel::notify_proc_ids(const int *sendproc, const int *recvproc) {
  for (int phase = 0; phase < 6; ++phase) {
    this->sendproc[phase] = sendproc[phase];
    this->recvproc[phase] = recvproc[phase];
  }
}
// One dry run (per MD step) of forward communication to build the pack /
// unpack index maps used by the real feature and gradient communications.
void PairE3GNNParallel::comm_preprocess() {
  assert(!comm_preprocess_done);
  CommBrick *comm_brick = dynamic_cast<CommBrick *>(comm);
  // fake lammps communication call to preprocess index
  // gives complete comm_index_pack, unpack_forward, and extra_graph_idx_map
  comm_brick->forward_comm(this);
  // track, per destination rank, which graph indices were already routed
  std::map<int, std::set<int>> already_met_map;
  for (int comm_phase = 0; comm_phase < 6; comm_phase++) {
    const int n = comm_index_pack_forward[comm_phase].size();
    int sproc = this->sendproc[comm_phase];
    if (already_met_map.count(sproc) == 0) {
      already_met_map.insert({sproc, std::set<int>()});
    }
    // for unpack_reverse, Ignore duplicated index by 'already_met'
    std::vector<long> &idx_map_forward = comm_index_pack_forward[comm_phase];
    std::vector<long> &idx_map_reverse = comm_index_unpack_reverse[comm_phase];
    std::set<int>& already_met = already_met_map[sproc];
    // the last index of x_comm is used to trash unnecessary values
    const int trash_index =
        graph_size + static_cast<int>(extra_graph_idx_map.size()); //+ 1;
    for (int i = 0; i < n; i++) {
      const int idx = idx_map_forward[i];
      if (idx < graph_size) {
        // a real graph row: route duplicates to the trash row so reverse
        // communication does not double-count a contribution
        if (already_met.count(idx) == 1) {
          idx_map_reverse.push_back(trash_index);
        } else {
          idx_map_reverse.push_back(idx);
          already_met.insert(idx);
        }
      } else {
        // extra-region row: forwarded as-is
        idx_map_reverse.push_back(idx);
      }
    }
    if (use_cuda_mpi) {
      // NOTE(review): from_blob aliases host vector memory; the immediate
      // .to(device) copies it, so the temporaries here appear safe
      comm_index_pack_forward_tensor[comm_phase] = torch::from_blob(idx_map_forward.data(), idx_map_forward.size(), INTEGER_TYPE).to(device);
      auto upmap = comm_index_unpack_forward[comm_phase];
      comm_index_unpack_forward_tensor[comm_phase] = torch::from_blob(upmap.data(), upmap.size(), INTEGER_TYPE).to(device);
      comm_index_unpack_reverse_tensor[comm_phase] = torch::from_blob(idx_map_reverse.data(), idx_map_reverse.size(), INTEGER_TYPE).to(device);
    }
  }
  comm_preprocess_done = true;
}
// called from comm_brick if comm_preprocess_done is false
// Index-building pass of a forward communication phase (called from the
// modified CommBrick while comm_preprocess_done is false): record, for each
// atom to send, the x_comm row index to read from.
void PairE3GNNParallel::pack_forward_init(int n, int *list_send,
                                          int comm_phase) {
  std::vector<long> &idx_map = comm_index_pack_forward[comm_phase];
  idx_map.reserve(n);
  tagint *tag = atom->tag;
  for (int i = 0; i < n; i++) {
    int list_i = list_send[i];
    int graph_idx = tag_to_graph_idx_ptr[tag[list_i]];
    if (graph_idx != -1) {
      // known atom (local atom + ghost atom inside cutoff)
      idx_map.push_back(graph_idx);
    } else {
      // unknown atom, these are not used in computation in this process
      // instead, this process is used to hand over these atoms to other
      // process; hold them in continuous manner for tensor operations later
      auto found = extra_graph_idx_map.find(list_i);
      if (found != extra_graph_idx_map.end()) {
        idx_map.push_back(found->second);
      } else {
        // Bug fix: key the map by list_i (the atom index) so the find()
        // above can actually locate previously assigned slots; the original
        // inserted with the loop counter i, which never matches list_i.
        const long new_idx =
            graph_size + static_cast<long>(extra_graph_idx_map.size());
        extra_graph_idx_map[list_i] = new_idx;
        idx_map.push_back(new_idx);
      }
    }
  }
}
// called from comm_brick if comm_preprocess_done is false
// Index-building pass of forward communication (called from the modified
// CommBrick while comm_preprocess_done is false): record the x_comm row
// each received atom's features should be written to.
void PairE3GNNParallel::unpack_forward_init(int n, int first, int comm_phase) {
  std::vector<long> &idx_map = comm_index_unpack_forward[comm_phase];
  idx_map.reserve(n);
  const int last = first + n;
  tagint *tag = atom->tag;
  for (int i = first; i < last; i++) {
    int graph_idx = tag_to_graph_idx_ptr[tag[i]];
    if (graph_idx != -1) {
      // atom participates in this process' graph
      idx_map.push_back(graph_idx);
    } else {
      // atom outside cutoff: park it in the extra region after the graph
      // rows (index computed before insertion for clarity)
      const long new_idx =
          graph_size + static_cast<long>(extra_graph_idx_map.size());
      extra_graph_idx_map[i] = new_idx;
      idx_map.push_back(new_idx);
    }
  }
}
int PairE3GNNParallel::pack_forward_comm_gnn(float *buf, int comm_phase) {
std::vector<long> &idx_map = comm_index_pack_forward[comm_phase];
const int n = static_cast<int>(idx_map.size());
if (use_cuda_mpi && n != 0) {
torch::Tensor &idx_map_tensor = comm_index_pack_forward_tensor[comm_phase];
auto selected = x_comm.index_select(0, idx_map_tensor); // its size is x_dim * n
cudaError_t cuda_err =
cudaMemcpy(buf, selected.data_ptr<float>(), (x_dim * n) * sizeof(float),
cudaMemcpyDeviceToDevice);
} else {
int i, j, m;
m = 0;
for (i = 0; i < n; i++) {
const int idx = static_cast<int>(idx_map.at(i));
float *from = x_comm[idx].data_ptr<float>();
for (j = 0; j < x_dim; j++) {
buf[m++] = from[j];
}
}
}
if (print_info) {
std::cout << world_rank << " comm_phase: " << comm_phase << std::endl;
std::cout << world_rank << " pack_forward x_dim: " << x_dim << std::endl;
std::cout << world_rank << " pack_forward n: " << n << std::endl;
std::cout << world_rank << " pack_forward x_dim*n: " << x_dim * n
<< std::endl;
double Msend = static_cast<double>(x_dim * n * 4) / (1024 * 1024);
std::cout << world_rank << " send size(MB): " << Msend << "\n" << std::endl;
}
return x_dim * n;
}
// Scatter received per-atom features from buf into the x_comm rows given by
// this phase's unpack-forward index map (overwriting destination rows).
void PairE3GNNParallel::unpack_forward_comm_gnn(float *buf, int comm_phase) {
  std::vector<long> &idx_map = comm_index_unpack_forward[comm_phase];
  const int n = static_cast<int>(idx_map.size());
  if (use_cuda_mpi && n != 0) {
    // device path: buf already resides on the GPU (cuda-aware MPI)
    torch::Tensor &idx_map_tensor = comm_index_unpack_forward_tensor[comm_phase];
    auto buf_tensor =
        torch::from_blob(buf, {n, x_dim}, FLOAT_TYPE.device(device));
    x_comm.scatter_(0, idx_map_tensor.repeat_interleave(x_dim).view({n, x_dim}),
                    buf_tensor);
  } else {
    // host path: element-wise copy into each destination row
    int i, j, m;
    m = 0;
    for (i = 0; i < n; i++) {
      const int idx = static_cast<int>(idx_map.at(i));
      float *to = x_comm[idx].data_ptr<float>();
      for (j = 0; j < x_dim; j++) {
        to[j] = buf[m++];
      }
    }
  }
}
int PairE3GNNParallel::pack_reverse_comm_gnn(float *buf, int comm_phase) {
std::vector<long> &idx_map = comm_index_unpack_forward[comm_phase];
const int n = static_cast<int>(idx_map.size());
if (use_cuda_mpi && n != 0) {
torch::Tensor &idx_map_tensor = comm_index_unpack_forward_tensor[comm_phase];
auto selected = x_comm.index_select(0, idx_map_tensor);
cudaError_t cuda_err = cudaMemcpy(buf, selected.data_ptr<float>(), (x_dim * n) * sizeof(float), cudaMemcpyDeviceToDevice);
} else {
int i, j, m;
m = 0;
for (i = 0; i < n; i++) {
const int idx = static_cast<int>(idx_map.at(i));
float *from = x_comm[idx].data_ptr<float>();
for (j = 0; j < x_dim; j++) {
buf[m++] = from[j];
}
}
}
if (print_info) {
std::cout << world_rank << " comm_phase: " << comm_phase << std::endl;
std::cout << world_rank << " pack_reverse x_dim: " << x_dim << std::endl;
std::cout << world_rank << " pack_reverse n: " << n << std::endl;
std::cout << world_rank << " pack_reverse x_dim*n: " << x_dim * n
<< std::endl;
double Msend = static_cast<double>(x_dim * n * 4) / (1024 * 1024);
}
return x_dim * n;
}
// Accumulate received gradient rows into x_comm ("add" semantics: several
// neighbor processes may contribute to the same owned atom). Duplicated
// destinations were redirected to the trash row in comm_preprocess().
void PairE3GNNParallel::unpack_reverse_comm_gnn(float *buf, int comm_phase) {
  std::vector<long> &idx_map = comm_index_unpack_reverse[comm_phase];
  const int n = static_cast<int>(idx_map.size());
  if (use_cuda_mpi && n != 0) {
    torch::Tensor &idx_map_tensor = comm_index_unpack_reverse_tensor[comm_phase];
    auto buf_tensor =
        torch::from_blob(buf, {n, x_dim}, FLOAT_TYPE.device(device));
    x_comm.scatter_(0, idx_map_tensor.repeat_interleave(x_dim).view({n, x_dim}),
                    buf_tensor, "add");
  } else {
    int i, j, m;
    m = 0;
    for (i = 0; i < n; i++) {
      const int idx = static_cast<int>(idx_map.at(i));
      if (idx == -1) {
        // sentinel: skip this row's payload entirely
        m += x_dim;
        continue;
      }
      float *to = x_comm[idx].data_ptr<float>();
      for (j = 0; j < x_dim; j++) {
        to[j] += buf[m++];
      }
    }
  }
}
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef PAIR_CLASS
PairStyle(e3gnn/parallel, PairE3GNNParallel)
#else
#ifndef LMP_PAIR_E3GNN_PARALLEL
#define LMP_PAIR_E3GNN_PARALLEL
#include "pair.h"
#include <torch/torch.h>
#include <vector>
namespace LAMMPS_NS {
// Multi-process SevenNet E3-equivariant GNN pair style. The model is split
// into segments (model_list) run in sequence; between segments, ghost-atom
// features (forward) and their gradients (reverse) are exchanged with
// neighbor processes through a modified CommBrick.
class PairE3GNNParallel : public Pair {
 private:
  double cutoff;        // model cutoff radius, from model metadata
  double cutoff_square; // cutoff^2 for sqrt-free distance checks
  std::vector<torch::jit::Module> model_list; // TorchScript model segments
  torch::Device device = torch::kCPU;      // compute device
  torch::Device device_comm = torch::kCPU; // device MPI staging buffers use
  torch::Device get_cuda_device();
  bool use_cuda_mpi; // true when buffers can be exchanged directly on GPU
  // for communication
  // Most of these variables for communication is temporary and valid for only
  // one MD step.
  int x_dim; // to determine per atom data size
  int graph_size;
  torch::Tensor x_comm; // x_local + x_ghost + x_comm_extra
  void comm_preprocess();
  bool comm_preprocess_done = false;
  // temporary variables holds for each compute step
  std::unordered_map<int, long> extra_graph_idx_map;
  // To use scatter, store long instead of int
  // array of vector, one entry per swap direction
  std::vector<long> comm_index_pack_forward[6];
  std::vector<long> comm_index_unpack_forward[6];
  std::vector<long> comm_index_unpack_reverse[6];
  // its size is 6 and initialized at comm_preprocess()
  torch::Tensor comm_index_pack_forward_tensor[6];
  torch::Tensor comm_index_unpack_forward_tensor[6];
  torch::Tensor comm_index_unpack_reverse_tensor[6];
  // to use tag_to_graph_idx inside comm methods
  int *tag_to_graph_idx_ptr = nullptr;
  int sendproc[6]; // destination rank per swap direction
  int recvproc[6]; // source rank per swap direction

 public:
  PairE3GNNParallel(class LAMMPS *);
  ~PairE3GNNParallel();
  // TODO: keep encapsulation..
  void compute(int, int) override;
  void settings(int, char **) override;
  // read Atom type string from input script & related coeff
  void coeff(int, char **) override;
  void allocate();
  // index-building passes (first, "fake" communication of each step)
  void pack_forward_init(int n, int *list, int comm_phase);
  void unpack_forward_init(int n, int first, int comm_phase);
  // real feature/gradient communication callbacks used by CommBrick
  int pack_forward_comm_gnn(float *buf, int comm_phase);
  void unpack_forward_comm_gnn(float *buf, int comm_phase);
  int pack_reverse_comm_gnn(float *buf, int comm_phase);
  void unpack_reverse_comm_gnn(float *buf, int comm_phase);
  void init_style() override;
  double init_one(int, int) override;
  int get_x_dim();
  bool use_cuda_mpi_();
  bool is_comm_preprocess_done();
  void notify_proc_ids(const int *sendproc, const int *recvproc);
  bool print_info = false; // verbose debug printing (SEVENN_PRINT_INFO)
  int world_rank;          // MPI rank, cached for log prefixes
};
// Process-wide singleton owning grow-only device (GPU) staging buffers for
// the CUDA-aware-MPI communication path.
class DeviceBuffManager {
 private:
  DeviceBuffManager() {}
  // non-copyable: copy operations declared but intentionally not defined
  DeviceBuffManager(const DeviceBuffManager &);
  DeviceBuffManager &operator=(const DeviceBuffManager &);
  float *buf_send_device = nullptr; // device send staging buffer
  float *buf_recv_device = nullptr; // device receive staging buffer
  int send_buf_size = 0; // current send capacity, in floats
  int recv_buf_size = 0; // current receive capacity, in floats

 public:
  static DeviceBuffManager &getInstance();
  // Ensure buffers can hold the requested float counts; returns pointers.
  void get_buffer(int, int, float *&, float *&);
  ~DeviceBuffManager();
};
} // namespace LAMMPS_NS
#endif
#endif
#!/bin/bash
# Patch a LAMMPS source tree with SevenNet's e3gnn pair styles and, optionally,
# the CUDA-accelerated pair_d3.
# Requires bash (uses [[ ]] and BASH_SOURCE), so invoke with `bash`, not `sh`.
lammps_root=$1
cxx_standard=$2 # 14, 17
d3_support=$3   # 1, 0
SCRIPT_DIR=$(dirname "${BASH_SOURCE[0]}")

###########################################
#  Check if the given arguments are valid #
###########################################

# Check the number of arguments
if [ "$#" -ne 3 ]; then
    echo "Usage: bash patch_lammps.sh {lammps_root} {cxx_standard} {d3_support}"
    echo "  {lammps_root}: Root directory of LAMMPS source"
    echo "  {cxx_standard}: C++ standard (14, 17)"
    echo "  {d3_support}: Support for pair_d3 (1, 0)"
    exit 1
fi

# Check if the lammps_root directory exists
if [ ! -d "$lammps_root" ]; then
    echo "Error: No such directory: $lammps_root"
    exit 1
fi

# Check if the given directory is the root of LAMMPS source.
# A LAMMPS root contains BOTH cmake/ and potentials/, so fail when either is
# missing. (Bug fix: the original used '&&', which only failed when both were
# absent.)
if [ ! -d "$lammps_root/cmake" ] || [ ! -d "$lammps_root/potentials" ]; then
    echo "Error: Given $lammps_root is not a root of LAMMPS source"
    exit 1
fi

# Check if the script is being run from the root of SevenNet
if [ ! -f "${SCRIPT_DIR}/pair_e3gnn.cpp" ]; then
    echo "Error: Script executed in a wrong directory"
    exit 1
fi

# Check if the patch is already applied
if [ -f "$lammps_root/src/pair_e3gnn.cpp" ]; then
    echo "----------------------------------------------------------"
    echo "Seems like given LAMMPS is already patched."
    echo "Try again after removing src/pair_e3gnn.cpp to force patch"
    echo "----------------------------------------------------------"
    echo "Example build commands, under LAMMPS root"
    echo "  mkdir build; cd build"
    echo "  cmake ../cmake -DCMAKE_PREFIX_PATH=$(python -c 'import torch;print(torch.utils.cmake_prefix_path)')"
    echo "  make -j 4"
    exit 0
fi

# Check if OpenMPI exists and if it is CUDA-aware
if command -v ompi_info &> /dev/null; then
    cuda_support=$(ompi_info --parsable --all | grep mpi_built_with_cuda_support:value)
    if [[ -z "$cuda_support" ]]; then
        # ompi_info ran but did not report the flag, so we cannot tell.
        # (Bug fix: the original wrongly printed "OpenMPI not found" here.)
        echo "Could not determine if OpenMPI is CUDA aware, parallel performance may not be optimal"
    elif [[ "$cuda_support" == *"true" ]]; then
        echo "OpenMPI is CUDA aware"
    else
        echo "This system's OpenMPI is not 'CUDA aware', parallel performance is not optimal"
    fi
else
    echo "OpenMPI not found, parallel performance is not optimal"
fi

# Extract LAMMPS version (e.g. '2 Aug 2023') from src/version.h
lammps_version=$(grep "#define LAMMPS_VERSION" "$lammps_root/src/version.h" | awk '{print $3, $4, $5}' | tr -d '"')
detected_version="$lammps_version"
required_version="2 Aug 2023" # Example required version

# Warn (do not abort) when the detected version differs from the tested one
if [[ "$detected_version" != "$required_version" ]]; then
    echo "Warning: Detected LAMMPS version ($detected_version) may not be compatible. Required version: $required_version"
fi

###########################################
#  Backup original LAMMPS source code     #
###########################################
# Create a backup directory if it doesn't exist
backup_dir="$lammps_root/_backups"
mkdir -p "$backup_dir"

# Copy comm_* from original LAMMPS source as backup
cp "$lammps_root/src/comm_brick.cpp" "$backup_dir/"
cp "$lammps_root/src/comm_brick.h" "$backup_dir/"

# Copy cmake/CMakeLists.txt from original source as backup
cp "$lammps_root/cmake/CMakeLists.txt" "$backup_dir/CMakeLists.txt"

###########################################
#  Patch LAMMPS source code: e3gnn        #
###########################################
# 1. Copy pair_e3gnn files to LAMMPS source
cp "$SCRIPT_DIR"/{pair_e3gnn,pair_e3gnn_parallel,comm_brick}.cpp "$lammps_root/src/"
cp "$SCRIPT_DIR"/{pair_e3gnn,pair_e3gnn_parallel,comm_brick}.h "$lammps_root/src/"

# 2. Patch cmake/CMakeLists.txt: bump the C++ standard and link LibTorch
sed -i "s/set(CMAKE_CXX_STANDARD 11)/set(CMAKE_CXX_STANDARD $cxx_standard)/" "$lammps_root/cmake/CMakeLists.txt"
cat >> "$lammps_root/cmake/CMakeLists.txt" << "EOF"
find_package(Torch REQUIRED)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
target_link_libraries(lammps PUBLIC "${TORCH_LIBRARIES}")
EOF

###########################################
#  Patch LAMMPS source code: d3           #
###########################################
if [ "$d3_support" -ne 0 ]; then
    # 1. Copy pair_d3 files to LAMMPS source
    cp "$SCRIPT_DIR/pair_d3.cu" "$lammps_root/src/"
    cp "$SCRIPT_DIR/pair_d3.h" "$lammps_root/src/"
    cp "$SCRIPT_DIR/pair_d3_pars.h" "$lammps_root/src/"

    # 2. Patch cmake/CMakeLists.txt: enable the CUDA language and make the
    #    source glob also pick up .cu files
    sed -i "s/project(lammps CXX)/project(lammps CXX CUDA)/" "$lammps_root/cmake/CMakeLists.txt"
    sed -i "s/\${LAMMPS_SOURCE_DIR}\/\[\^.\]\*\.cpp/\${LAMMPS_SOURCE_DIR}\/\[\^.\]\*\.cpp \${LAMMPS_SOURCE_DIR}\/\[\^.\]\*\.cu/" "$lammps_root/cmake/CMakeLists.txt"
    cat >> "$lammps_root/cmake/CMakeLists.txt" << "EOF"
find_package(CUDA)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -fmad=false -O3")
string(REPLACE "-gencode arch=compute_50,code=sm_50" "" CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}")
target_link_libraries(lammps PUBLIC ${CUDA_LIBRARIES} cuda)
EOF
fi

###########################################
# Print changes and backup file locations #
###########################################
echo "Changes made:"
echo "  - Original LAMMPS files (src/comm_brick.*, cmake/CMakeLists.txt) are in {lammps_root}/_backups"
echo "  - Copied contents of pair_e3gnn to $lammps_root/src/"
echo "  - Patched CMakeLists.txt: include LibTorch, CXX_STANDARD $cxx_standard"
if [ "$d3_support" -ne 0 ]; then
    echo "  - Copied contents of pair_d3 to $lammps_root/src/"
    echo "  - Patched CMakeLists.txt: include CUDA"
fi

# Provide example cmake command to the user
echo "Example build commands, under LAMMPS root"
echo "  mkdir build; cd build"
echo "  cmake ../cmake -DCMAKE_PREFIX_PATH=$(python -c 'import torch;print(torch.utils.cmake_prefix_path)')"
echo "  make -j 4"
exit 0
model:
chemical_species: 'univ' # Ready for 119 elements
cutoff: 5.0
channel: 128
is_parity: False
lmax: 2
num_convolution_layer: 5
irreps_manual:
- "128x0e"
- "128x0e+64x1e+32x2e"
- "128x0e+64x1e+32x2e"
- "128x0e+64x1e+32x2e"
- "128x0e+64x1e+32x2e"
- "128x0e"
weight_nn_hidden_neurons: [64, 64]
radial_basis:
radial_basis_name: 'bessel'
bessel_basis_num: 8
cutoff_function:
cutoff_function_name: 'XPLOR'
cutoff_on: 4.5
act_gate: {'e': 'silu', 'o': 'tanh'}
act_scalar: {'e': 'silu', 'o': 'tanh'}
conv_denominator: 'avg_num_neigh'
train_shift_scale: False
train_denominator: False
self_connection_type: 'linear'
# Following are used to specify which part of the model would utilize fidelity-dependent parameters for multi-fidelity training.
# For detailed architecture, please refer to https://arxiv.org/abs/2409.07947
# Parts using fidelity-dependent weights are indicated as `Modified linear` layers in Figure 1.
use_modal_node_embedding: False # If true, use modified linear layer in atom-type embedding layer.
use_modal_self_inter_intro: True # If true, use modified linear layers in self-interaction block before the convolution in the interaction blocks.
use_modal_self_inter_outro: True # If true, use modified linear layers in self-interaction block after the convolution in the interaction blocks.
use_modal_output_block: True # If true, use modified linear layer in the output block.
train:
train_shuffle: True
random_seed: 777
is_train_stress : True
epoch: 200
loss: 'Huber'
loss_param:
delta: 0.01
optimizer: 'adam'
optim_param:
lr: 0.01
scheduler: 'linearlr'
scheduler_param:
start_factor: 1.0
total_iters: 200
end_factor: 0.0001
force_loss_weight : 1.00
stress_loss_weight: 0.01
error_record:
- ['Energy', 'MAE']
- ['Force', 'MAE']
- ['Stress', 'MAE']
- ['Energy', 'Loss']
- ['Force', 'Loss']
- ['Stress', 'Loss']
- ['TotalLoss', 'None']
per_epoch: 10
use_modality: True
use_weight: True
data:
batch_size: 64
shift: 'elemwise_reference_energies'
scale: 1.73
use_modal_wise_shift: True # If true, use different atomic energy shift for each database
use_modal_wise_scale: False # If true, use different atomic energy scale for each database
load_trainset_path:
- data_modality: pbe # Name of database
file_list:
- file: "**path to PBE database**" # ASE readable or .pt file (graph.pt)
data_weight:
energy: 1.0
force: 1.0 # This weight would be additionally multiplied to `force_loss_weight` for this database
stress: 1.0 # This weight would be additionally multiplied to `stress_loss_weight` for this database
- data_modality: r2scan
file_list:
- file: "**path to r2SCAN database**"
data_weight:
energy: 7.0
force: 7.0
stress: 7.0
load_pbe_validset_path: # any name starts with 'load' and ends with 'set_path'
- data_modality: pbe # modality must be given for mm valid set
file_list:
- file: "**path to PBE test set**"
load_scan_validset_path:
- data_modality: r2scan
file_list:
- file: "**path to r2SCAN test set**"
# Example input.yaml for training SevenNet.
# '*' signifies default. You can check log.sevenn for defaults.
model:
chemical_species: 'Auto' # Elements model should know. [ 'Univ' | 'Auto' | manual_user_input ]
cutoff: 5.0 # Cutoff radius in Angstroms. If two atoms are within the cutoff, they are connected.
channel: 32 # The multiplicity(channel) of node features.
lmax: 2 # Maximum order of irreducible representations (rotation order).
num_convolution_layer: 3 # The number of message passing layers.
#irreps_manual: # Manually set irreps of the model in each layer
#- "128x0e"
#- "128x0e+64x1e+32x2e"
#- "128x0e+64x1e+32x2e"
#- "128x0e+64x1e+32x2e"
#- "128x0e+64x1e+32x2e"
#- "128x0e"
weight_nn_hidden_neurons: [64, 64] # Hidden neurons in convolution weight neural network
radial_basis: # Function and its parameters to encode radial distance
radial_basis_name: 'bessel' # Only 'bessel' is currently supported
bessel_basis_num: 8
cutoff_function: # Envelop function, multiplied to radial_basis functions to init edge features
cutoff_function_name: 'poly_cut' # {'poly_cut' and 'poly_cut_p_value'} or {'XPLOR' and 'cutoff_on'}
poly_cut_p_value: 6
act_gate: {'e': 'silu', 'o': 'tanh'} # Equivalent to 'nonlinearity_gates' in nequip
act_scalar: {'e': 'silu', 'o': 'tanh'} # Equivalent to 'nonlinearity_scalars' in nequip
is_parity: False # Parity: True (E(3) group) or False (SE(3) group)
self_connection_type: 'nequip' # Default is 'nequip'. 'linear' is used for SevenNet-0. I recommend 'linear' for 'Univ' chemical_species
conv_denominator: "avg_num_neigh" # Valid options are "avg_num_neigh*", "sqrt_avg_num_neigh", or float
train_denominator: False # Enable training for denominator in convolution layer
train_shift_scale: False # Enable training for shift & scale in output layer
train:
random_seed: 1
is_train_stress: True # Includes stress in the loss function
epoch: 200 # Ends training after this number of epochs
#loss: 'Huber' # Default is 'mse' (mean squared error)
#loss_param:
#delta: 0.01
# Each optimizer and scheduler have different available parameters.
# You can refer to sevenn/train/optim.py for supporting optimizer & schedulers
optimizer: 'adam' # Options available are 'sgd', 'adagrad', 'adam', 'adamw', 'radam'
optim_param:
lr: 0.005
scheduler: 'exponentiallr' # 'steplr', 'multisteplr', 'exponentiallr', 'cosineannealinglr', 'reducelronplateau', 'linearlr'
scheduler_param:
gamma: 0.99
force_loss_weight: 0.1 # Coefficient for force loss
stress_loss_weight: 1e-06 # Coefficient for stress loss (to kbar unit)
per_epoch: 10 # Generate checkpoints every this epoch
# ['target y', 'metric']
# Target y: TotalEnergy, Energy, Force, Stress, Stress_GPa, TotalLoss
# Metric : RMSE, MAE, or Loss
error_record:
- ['Energy', 'RMSE']
- ['Force', 'RMSE']
- ['Stress', 'RMSE']
- ['TotalLoss', 'None']
# Continue training model from given checkpoint, or pre-trained model checkpoint for fine-tuning
#continue:
#checkpoint: 'checkpoint_best.pth' # Checkpoint of pre-trained model or a model want to continue training.
#reset_optimizer: False # Set True for fine-tuning
#reset_scheduler: False # Set True for fine-tuning
data:
batch_size: 4 # Per GPU batch size.
shift: 'per_atom_energy_mean' # One of 'per_atom_energy_mean*', 'elemwise_reference_energies', float
scale: 'force_rms' # One of 'force_rms*', 'per_atom_energy_std', float
# SevenNet automatically matches data format from its filename.
# For those not `structure_list` or `.pt` files, assumes it is ASE readable
# In this case, below arguments are directly passed to `ase.io.read`
data_format_args:
index: ':' # see `https://wiki.fysik.dtu.dk/ase/ase/io/io.html` for more valid arguments
# validset is needed if you want '_best.pth' during training. If not, both validset and testset is optional.
load_trainset_path: ['./train_*.extxyz'] # Example of using ase as data_format, support multiple files and expansion(*)
load_validset_path: ['./valid.extxyz']
load_testset_path: ['./sevenn_data/mydata.pt'] # Graph can be preprocessed using `sevenn_graph_build` and accessible like this
# Example input.yaml for fine-tuning sevennet-0
# '*' signifies default. You can check log.sevenn for defaults.
model: # model keys should be consistent except for train_* keys
chemical_species: 'Auto'
cutoff: 5.0
channel: 128
is_parity: False
lmax: 2
num_convolution_layer: 5
irreps_manual:
- "128x0e"
- "128x0e+64x1e+32x2e"
- "128x0e+64x1e+32x2e"
- "128x0e+64x1e+32x2e"
- "128x0e+64x1e+32x2e"
- "128x0e"
weight_nn_hidden_neurons: [64, 64]
radial_basis:
radial_basis_name: 'bessel'
bessel_basis_num: 8
cutoff_function:
cutoff_function_name: 'XPLOR'
cutoff_on: 4.5
self_connection_type: 'linear'
train_shift_scale: False # customizable (True | False)
train_denominator: False # customizable (True | False)
train: # Customizable
random_seed: 1
is_train_stress: True
epoch: 100
loss: 'Huber' # keeping original loss function give better ft result
loss_param:
delta: 0.01
optimizer: 'adam'
optim_param:
lr: 0.004
scheduler: 'exponentiallr'
scheduler_param:
gamma: 0.99
force_loss_weight: 1.0
stress_loss_weight: 0.01
per_epoch: 10 # Generate checkpoints every this epoch
# ['target y', 'metric']
# Target y: TotalEnergy, Energy, Force, Stress, Stress_GPa, TotalLoss
# Metric : RMSE, MAE, or Loss
error_record:
- ['Energy', 'RMSE']
- ['Force', 'RMSE']
- ['Stress', 'RMSE']
- ['TotalLoss', 'None']
continue:
reset_optimizer: True
reset_scheduler: True
reset_epoch: True
checkpoint: 'SevenNet-0_11July2024'
data: # Customizable
batch_size: 4
data_divide_ratio: 0.1
# SevenNet automatically matches data format from its filename.
# For those not `structure_list` or `.pt` files, assumes it is ASE readable
# In this case, below arguments are directly passed to `ase.io.read`
data_format_args:
index: ':' # see `https://wiki.fysik.dtu.dk/ase/ase/io/io.html` for more valid arguments
# validset is needed if you want '_best.pth' during training. If not, both validset and testset is optional.
load_trainset_path: ['./train_*.extxyz'] # Example of using ase as data_format, support multiple files and expansion(*)
load_validset_path: ['./valid.extxyz']
load_testset_path: ['./sevenn_data/mydata.pt'] # Graph can be preprocessed using `sevenn_graph_build` and accessible like this
# Application of 7net-0 on liquid electrolyte system via fine-tuning
# Paper: https://arxiv.org/abs/2501.05211
model:
# parameters of SevenNet-0, should not be changed
chemical_species: 'auto'
cutoff: 5.0
channel: 128
is_parity: False
lmax: 2
num_convolution_layer: 5
irreps_manual:
- "128x0e"
- "128x0e+64x1e+32x2e"
- "128x0e+64x1e+32x2e"
- "128x0e+64x1e+32x2e"
- "128x0e+64x1e+32x2e"
- "128x0e"
weight_nn_hidden_neurons: [64, 64]
radial_basis:
radial_basis_name: 'bessel'
bessel_basis_num: 8
cutoff_function:
cutoff_function_name: 'XPLOR'
cutoff_on: 4.5
act_gate: {'e': 'silu', 'o': 'tanh'}
act_scalar: {'e': 'silu', 'o': 'tanh'}
self_connection_type: 'linear'
# useful for fine-tuning
train_shift_scale: True
train_avg_num_neigh: True
train:
random_seed: 1
is_train_stress: True
epoch: 100 # we trained for 100 epochs and chose the checkpoint at epoch 50, where the error had reached a plateau.
loss: 'Huber'
loss_param:
delta: 0.01
optimizer: 'adam'
optim_param:
lr: 0.0001
scheduler: 'linearlr'
scheduler_param:
start_factor: 1.0
total_iters: 600
end_factor: 0.000001
force_loss_weight: 1.00
stress_loss_weight: 1.00 # 7net-0 quantitatively lacked accuracy on pressure histograms compared to DFT, so we increased stress loss weight
error_record:
- ['Energy', 'RMSE']
- ['Force', 'RMSE']
- ['Stress', 'RMSE']
- ['Energy', 'MAE']
- ['Force', 'MAE']
- ['Stress', 'MAE']
- ['Energy', 'Loss']
- ['Force', 'Loss']
- ['Stress', 'Loss']
- ['TotalLoss', 'None']
per_epoch: 10 # Generate a checkpoint every this number of epochs
continue:
use_statistic_values_of_checkpoint: True
checkpoint: '7net-0' # fine-tuning from 7net-0
reset_optimizer: True
reset_scheduler: True
data:
batch_size: 1 # our fine-tuning dataset had ~360 atoms per structure, so we used batch size of 1 to avoid GPU OOM error.
shift: 'elemwise_reference_energies'
scale: 1.858
data_format: 'ase'
data_divide_ratio: 0.05
load_dataset_path: ["./data/total.extxyz"]
model:
chemical_species: 'univ' # Ready for 119 elements
cutoff: 6.0
channel: 128
is_parity: False
lmax: 3
num_convolution_layer: 3
irreps_manual:
- "128x0e"
- "128x0e+64x1e+32x2e+16x3e"
- "128x0e+64x1e+32x2e+16x3e"
- "128x0e"
weight_nn_hidden_neurons: [64, 64]
radial_basis:
radial_basis_name: 'bessel'
bessel_basis_num: 8
cutoff_function:
cutoff_function_name: 'XPLOR'
cutoff_on: 5.5
act_gate: {'e': 'silu', 'o': 'tanh'}
act_scalar: {'e': 'silu', 'o': 'tanh'}
conv_denominator: 'avg_num_neigh'
train_shift_scale: True
train_denominator: False
self_connection_type: 'linear'
# Following are used to specify which part of the model would utilize fidelity-dependent parameters for multi-fidelity training.
# For detailed architecture, please refer to https://arxiv.org/abs/2409.07947
# Parts using fidelity-dependent weights are indicated as `Modified linear` layers in Figure 1.
use_modal_node_embedding: False # If true, use modified linear layer in atom-type embedding layer.
use_modal_self_inter_intro: True # If true, use modified linear layers in self-interaction block before the convolution in the interaction blocks.
use_modal_self_inter_outro: True # If true, use modified linear layers in self-interaction block after the convolution in the interaction blocks.
use_modal_output_block: True # If true, use modified linear layer in the output block.
train:
train_shuffle: True
random_seed: 777
is_train_stress : True
epoch: 200
loss: 'Huber'
loss_param:
delta: 0.01
optimizer: 'adam'
optim_param:
lr: 0.01
scheduler: 'linearlr'
scheduler_param:
start_factor: 1.0
total_iters: 200
end_factor: 0.0001
force_loss_weight : 1.00
stress_loss_weight: 0.01
error_record:
- ['Energy', 'MAE']
- ['Force', 'MAE']
- ['Stress', 'MAE']
- ['Energy', 'Loss']
- ['Force', 'Loss']
- ['Stress', 'Loss']
- ['TotalLoss', 'None']
per_epoch: 10
use_modality: True
use_weight: True
data:
batch_size: 16
shift: 'elemwise_reference_energies'
scale: 'force_rms'
use_modal_wise_shift: True # If true, use different atomic energy shift for each database
use_modal_wise_scale: False # If true, use different atomic energy scale for each database
load_trainset_path:
- data_modality: pbe # Name of database
file_list:
- file: "path to pbe dataset" # ASE readable or .pt file (graph.pt)
data_weight:
energy: 1.0
force: 0.1 # This weight would be additionally multiplied to `force_loss_weight` for this database
stress: 1.0 # This weight would be additionally multiplied to `stress_loss_weight` for this database
- data_modality: scan
file_list:
- file: "path to scan dataset"
data_weight:
energy: 1.0
force: 10.0
stress: 1.0
load_pbe_validset_path: # any name starts with 'load' and ends with 'set_path'
- data_modality: pbe # modality must be given for mm valid set
file_list:
- file: "path to pbe validset"
load_scan_validset_path:
- data_modality: scan
file_list:
- file: "path to scan validset"
# SevenNet-0, should be run with `sevenn -m train_v1` as it uses old routine
model:
chemical_species: 'auto'
cutoff: 5.0
channel: 128
is_parity: False
lmax: 2
num_convolution_layer: 5
irreps_manual:
- "128x0e"
- "128x0e+64x1e+32x2e"
- "128x0e+64x1e+32x2e"
- "128x0e+64x1e+32x2e"
- "128x0e+64x1e+32x2e"
- "128x0e"
weight_nn_hidden_neurons: [64, 64]
radial_basis:
radial_basis_name: 'bessel'
bessel_basis_num: 8
cutoff_function:
cutoff_function_name: 'XPLOR'
cutoff_on: 4.5
act_gate: {'e': 'silu', 'o': 'tanh'}
act_scalar: {'e': 'silu', 'o': 'tanh'}
conv_denominator: 'avg_num_neigh'
train_shift_scale: False
train_denominator: False
self_connection_type: 'linear'
train:
train_shuffle: False
random_seed: 1
is_train_stress : True
epoch: 600
loss: 'Huber'
loss_param:
delta: 0.01
optimizer: 'adam'
optim_param:
lr: 0.01
scheduler: 'linearlr'
scheduler_param:
start_factor: 1.0
total_iters: 600
end_factor: 0.0001
force_loss_weight : 1.00
stress_loss_weight: 0.01
error_record:
- ['Energy', 'RMSE']
- ['Force', 'RMSE']
- ['Stress', 'RMSE']
- ['Energy', 'MAE']
- ['Force', 'MAE']
- ['Stress', 'MAE']
- ['Energy', 'Loss']
- ['Force', 'Loss']
- ['Stress', 'Loss']
- ['TotalLoss', 'None']
per_epoch: 10
# continue:
# checkpoint: './checkpoint_last.pth'
# reset_optimizer: False
# reset_scheduler: False
data:
batch_size: 128 # per GPU batch size, as the model trained with 32 GPUs, the effective batch size equals 4096.
scale: 'per_atom_energy_std'
shift: 'elemwise_reference_energies'
data_format: 'ase'
save_by_train_valid: False
load_dataset_path: ["path_to_MPtrj_total.sevenn_data"]
load_validset_path: ["validaset.sevenn_data"]
model:
chemical_species: auto
cutoff: 5.0
irreps_manual:
- 128x0e
- 128x0e+64x1e+32x2e+32x3e
- 128x0e+64x1e+32x2e+32x3e
- 128x0e+64x1e+32x2e+32x3e
- 128x0e+64x1e+32x2e+32x3e
- 128x0e
channel: 128
lmax: 3
num_convolution_layer: 5
is_parity: false
radial_basis:
radial_basis_name: bessel
bessel_basis_num: 8
cutoff_function:
cutoff_function_name: poly_cut
poly_cut_p_value: 6
act_radial: silu
weight_nn_hidden_neurons:
- 64
- 64
act_scalar:
e: silu
o: tanh
act_gate:
e: silu
o: tanh
train_denominator: false
train_shift_scale: false
use_bias_in_linear: false
readout_as_fcn: false
self_connection_type: linear
interaction_type: nequip
train:
random_seed: 1
epoch: 600
loss: Huber
loss_param:
delta: 0.01
optimizer: adam
optim_param:
lr: 0.01
scheduler: linearlr
scheduler_param:
start_factor: 1.0
total_iters: 600
end_factor: 0.0001
force_loss_weight: 1.0
stress_loss_weight: 0.01
per_epoch: 10
is_train_stress: true
train_shuffle: true
error_record:
- - Energy
- MAE
- - Energy
- RMSE
- - Force
- MAE
- - Force
- RMSE
- - Stress
- MAE
- - Stress
- RMSE
- - Energy
- Loss
- - Force
- Loss
- - Stress
- Loss
- - TotalLoss
- None
best_metric: TotalLoss
data:
data_format: ase
data_format_args: {}
batch_size: 1024 # global batch size, should be divided by the number of GPUs
load_trainset_path: '**path_to_trainset**'
load_validset_path: '**path_to_validset**'
shift: 'elemwise_reference_energies'
scale: 'force_rms'
"""
Debt
keep old pre-trained checkpoints unchanged.
"""
import copy
import torch
import sevenn._keys as KEY
def version_tuple(v1):
    """Parse a dotted version string such as '0.10.2' into a tuple of ints."""
    return tuple(int(part) for part in v1.split('.'))
def patch_old_config(config):
    """Upgrade, in place, a config dict loaded from an old checkpoint.

    For <= 0.9.x configs: drops 'poly_cut_p_value' when the XPLOR cutoff is
    used, renames 'train_avg_num_neigh' to the current denominator key, and
    rejects the long-removed ``optimize_by_reduce: False`` option.
    Also backfills defaults for keys newer checkpoints always carry.

    Raises:
        ValueError: when the config has no 'version', or when it used
            ``optimize_by_reduce: False``.
    """
    version = config.get('version', None)
    if not version:
        raise ValueError('No version found in config')
    major_str, minor_str, _ = version.split('.')[:3]
    major, minor = int(major_str), int(minor_str)

    if major == 0 and minor <= 9:
        cutoff_fn = config[KEY.CUTOFF_FUNCTION]
        if cutoff_fn[KEY.CUTOFF_FUNCTION_NAME] == 'XPLOR':
            cutoff_fn.pop('poly_cut_p_value', None)
        if KEY.TRAIN_DENOMINTAOR not in config:
            config[KEY.TRAIN_DENOMINTAOR] = config.pop('train_avg_num_neigh', False)
        if config.pop('optimize_by_reduce', None) is False:
            raise ValueError(
                'This checkpoint(optimize_by_reduce: False) is no longer supported'
            )

    # Defaults for keys that old configs may lack entirely.
    if KEY.CONV_DENOMINATOR not in config:
        config[KEY.CONV_DENOMINATOR] = 0.0
    if KEY._NORMALIZE_SPH not in config:
        config[KEY._NORMALIZE_SPH] = False
    return config
def map_old_model(old_model_state_dict):
    """
    For compatibility with old namings (before 'correct' branch merged 2404XX).
    Returns a new state dict whose top-level module names follow the current
    convention; keys whose head is not in the rename table are kept verbatim.
    """
    renames = {
        'EdgeEmbedding': 'edge_embedding',
        'reducing nn input to hidden': 'reduce_input_to_hidden',
        'reducing nn hidden to energy': 'reduce_hidden_to_energy',
        'rescale atomic energy': 'rescale_atomic_energy',
    }
    for i in range(10):
        renames[f'{i} self connection intro'] = f'{i}_self_connection_intro'
        renames[f'{i} convolution'] = f'{i}_convolution'
        renames[f'{i} self interaction 2'] = f'{i}_self_interaction_2'
        renames[f'{i} equivariant gate'] = f'{i}_equivariant_gate'

    remapped = {}
    for key, value in old_model_state_dict.items():
        head, _, tail = key.partition('.')
        # Old checkpoints misspelled 'denominator' as 'denumerator'.
        tail = tail.replace('denumerator', 'denominator')
        if head in renames:
            remapped[f'{renames[head]}.{tail}'] = value
        else:
            # Unknown head: keep the original key untouched (including any
            # 'denumerator' spelling), matching the historical behavior.
            remapped[key] = value
    return remapped
def sort_old_convolution(model_now, state_dict):
    # Reorders each convolution's tensor-product weights in `state_dict` to
    # match the sorted instruction order of `model_now`, and repairs
    # sign-flipped Wigner-3j coefficients. Returns a new state dict.
    from e3nn.o3 import wigner_3j
    """
    Reason1: we have to sort instructions of convolution to be compatible with
    cuEquivariance. (therefore, sort weight)
    Reason2: some of old convolution module's w3j coeff has flipped sign. This also
    has to be fixed to be compatible with cuEquivarinace.
    """
    # NOTE(review): the triple-quoted text above is a bare string expression,
    # not a docstring, since it follows the import statement.

    def patch(stct):
        # Mutates `stct` (one convolution's slice of the state dict) in place.
        # Uses `conv`, `conv_key`, and `weight_nn` as closure variables bound
        # by the loop at the bottom of this function.
        inst_old = copy.copy(conv._instructions_before_sort)
        inst_old = [(inst[0], inst[1], inst[2]) for inst in inst_old]
        # Consumed once per conv; a second call on the same conv would fail.
        del conv._instructions_before_sort
        conv_args = conv.convolution_kwargs
        irreps_in1 = conv_args['irreps_in1']
        irreps_in2 = conv_args['irreps_in2']
        irreps_out = conv_args.get('irreps_out', conv_args.get('filter_irreps_out'))
        # Target order: instructions sorted by output irrep index.
        inst_sorted = sorted(inst_old, key=lambda x: x[2])
        inst_sorted = [
            # in1, in2, out, weights
            (inst[0], inst[1], inst[2], irreps_in1[inst[0]].mul)
            for inst in inst_sorted
        ]
        # Index of the weight NN layer holding the per-instruction weights
        # (second to last entry of `hs`) -- TODO confirm against weight_nn impl.
        n = len(weight_nn.hs) - 2
        ww_key = f'{conv_key}.weight_nn.layer{n}.weight'
        ww = stct[ww_key]
        ww_sorted = [None] * len(inst_old)
        _prev_idx = 0  # column offset of the current instruction's weight slab
        for ist_src in inst_old:
            for j, ist_dst in enumerate(inst_sorted):
                # Find where this (in1, in2, out) instruction landed after sort.
                if not all(ist_src[ii] == ist_dst[ii] for ii in range(3)):
                    continue
                numel = ist_dst[3]  # weight num
                ww_src = ww[:, _prev_idx : _prev_idx + numel]
                l1, l2, l3 = (
                    irreps_in1[ist_src[0]].ir.l,
                    irreps_in2[ist_src[1]].ir.l,
                    irreps_out[ist_src[2]].ir.l,
                )
                if l1 > 0 and l2 > 0 and l3 > 0:
                    # Old checkpoints may store the w3j coefficients with the
                    # opposite sign; flip both the stored coeff and the
                    # corresponding weights so the product is unchanged.
                    w3j_key = f'_w3j_{l1}_{l2}_{l3}'
                    conv_w3j_key = (
                        f'{conv_key}.convolution._compiled_main_left_right.{w3j_key}'
                    )
                    w3j_old = stct[conv_w3j_key]
                    w3j_now = wigner_3j(l1, l2, l3)
                    if not torch.allclose(w3j_old.to(w3j_now.device), w3j_now):
                        # Anything other than an exact sign flip is unexpected.
                        assert torch.allclose(
                            w3j_old.to(w3j_now.device), -1 * w3j_now
                        )
                        ww_src = -1 * ww_src
                        stct[conv_w3j_key] *= -1  # stct updated
                _prev_idx += numel
                ww_sorted[j] = ww_src
        ww_sorted = torch.cat(ww_sorted, dim=1)  # type: ignore
        stct[ww_key] = ww_sorted.clone()  # stct updated

    # Group state-dict entries by their convolution module ('{i}_convolution').
    conv_dicts = {}
    for k, v in state_dict.items():
        key_name = k.split('.')[0]
        # NOTE(review): this raises IndexError for top-level names without an
        # underscore; presumably all keys here already use new-style names
        # (run after map_old_model) -- TODO confirm.
        if key_name.split('_')[1] == 'convolution':
            if key_name not in conv_dicts:
                conv_dicts[key_name] = {}
            conv_dicts[key_name].update({k: v})
    new_state_dict = {}
    new_state_dict.update(state_dict)
    for conv_key, conv_state_dict in conv_dicts.items():
        conv = model_now._modules[conv_key]
        weight_nn = conv.weight_nn
        patch(conv_state_dict)
        new_state_dict.update(conv_state_dict)
    return new_state_dict
def patch_state_dict_if_old(state_dict, config_cp, now_model):
    """Apply backward-compatibility patches to `state_dict` based on the
    checkpoint version stored in `config_cp`.

    Raises:
        ValueError: when `config_cp` carries no 'version' entry.
    """
    version = config_cp.get('version', None)
    if not version:
        raise ValueError('No version found in config')

    parts = version.split('.')
    suffix = ''
    if len(parts) == 4:
        # e.g. '0.11.0.dev0': split off the dev suffix from the numeric part.
        suffix = parts[3]
        parts = parts[:3]
    numeric = version_tuple('.'.join(parts))

    # Checkpoints older than 0.10.0 still use the pre-rename module names.
    if numeric < version_tuple('0.10.0'):
        state_dict = map_old_model(state_dict)

    # TODO: change version criteria before release!!!
    # it causes problem if model is sorted but this function is called
    # ... more robust way? idk
    needs_conv_sort = numeric < version_tuple('0.11.0') or (
        numeric == version_tuple('0.11.0') and suffix == 'dev0'
    )
    if needs_conv_sort:
        state_dict = sort_old_convolution(now_model, state_dict)
    return state_dict
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment