proj_dm.py

import numpy as np
from pyscf import gto
import os
import sys
import argparse
import mendeleev
from calc_eig import calc_eig


# aa = 2.0**np.arange(6,-3,-1)
aa = 1.5**np.array([17,13,10,7,5,3,2,1,0,-1,-2,-3])
bb = np.diag(np.ones(aa.size)) - np.diag(np.ones(aa.size-1), k=1)
SHELL = [aa.size] * 3
coef = np.concatenate([aa.reshape(-1,1), bb], axis=1)
BASIS = [[0, *coef.tolist()], [1, *coef.tolist()], [2, *coef.tolist()]]


def parse_xyz(filename, basis='ccpvtz', verbose=False):
    with open(filename) as fp:
        natoms = int(fp.readline())
        comments = fp.readline()
        xyz_str = "".join(fp.readlines())
    mol = gto.Mole()
    mol.verbose = 4 if verbose else 0
    mol.atom = xyz_str
    mol.basis  = basis
    try:
        mol.build(0,0,unit="Ang")
    except RuntimeError as e:
        mol.spin = 1
        mol.build(0,0,unit="Ang")
    return mol  


def gen_proj(mol, intor = 'ovlp', verbose = False) :
    natm = mol.natm
    mole_coords = mol.atom_coords(unit="Ang")
    test_mol = gto.Mole()
    if verbose :
        test_mol.verbose = 4
    else :
        test_mol.verbose = 0
    test_mol.atom = [["Ne", coord] for coord in mole_coords]
    test_mol.basis = BASIS
    test_mol.spin = 0
    test_mol.build(0,0,unit="Ang")
    proj = gto.intor_cross(f'int1e_{intor}_sph', mol, test_mol) 
    
    def proj_func(mo):
        proj_coeff = np.matmul(mo, proj).reshape(*mo.shape[:2], natm, -1)
        if verbose:
            print('shape of coeff data          ', proj_coeff.shape)
        # res : nframe x nocc/nvir x natm x nproj
        return proj_coeff, proj_coeff.shape[-1]
    
    return proj_func


def proj_frame(xyz_file, mo_dir, dump_dir=None, basis='ccpvtz', ename="e_hf", intor='ovlp', verbose=False):
    mol = parse_xyz(xyz_file, basis=basis)
    meta, ehf, e_occ, c_occ = load_data(mo_dir, ename)
    
    proj_func = gen_proj(mol, intor, verbose)
    c_proj_occ,nproj = proj_func(c_occ)
    c_occ = c_proj_occ
    meta = np.append(meta, nproj)
    # print(meta, c_proj_occ.shape)

    if dump_dir is not None:
        dump_data(dump_dir, meta, ehf, e_occ, c_occ)
    return meta, ehf, e_occ, c_occ


def load_data(dir_name, ename="e_hf"):
    meta = np.loadtxt(os.path.join(dir_name, 'system.raw'), dtype=int).reshape(-1)
    natm = meta[0]
    nao = meta[1]
    nocc = meta[2]
    nvir = meta[3]
    ehf = np.loadtxt(os.path.join(dir_name, f'{ename}.raw')).reshape(-1, 1)
    e_occ = np.loadtxt(os.path.join(dir_name, 'ener_occ.raw')).reshape(-1, nocc)
    c_occ = np.loadtxt(os.path.join(dir_name, 'coeff_occ.raw')).reshape([-1, nocc, nao])
    return meta, ehf, e_occ, c_occ


def dump_data(dir_name, meta, ehf, e_occ, c_occ, dm_dict={}) :
    os.makedirs(dir_name, exist_ok = True)
    np.savetxt(os.path.join(dir_name, 'system.raw'), 
               meta.reshape(1,-1), 
               fmt = '%d',
               header = 'natm nao nocc nvir nproj')
    nframe = e_occ.shape[0]
    natm = meta[0]
    nao = meta[1]
    nocc = meta[2]
    nvir = meta[3]
    nproj = meta[4]
    # ntest == natm
    assert(all(c_occ.shape == np.array([nframe, nocc, natm, nproj], dtype=int)))
    assert(all(e_occ.shape == np.array([nframe, nocc], dtype=int)))
    assert(all(all(dm.shape == np.array([nframe, natm, nproj], dtype=int)) for dm in dm_dict.values()))
    np.save(os.path.join(dir_name, 'e_hf.npy'), ehf) 
    np.save(os.path.join(dir_name, 'ener_occ.npy'), e_occ)
    np.save(os.path.join(dir_name, 'coeff_occ.npy'), c_occ)
    for name, dm in dm_dict.items():
        np.save(os.path.join(dir_name, f'{name}.npy'), dm)


def main(xyz_files, mo_dirs, dump_dir, basis='ccpvtz', ename="e_hf", eig_names=['dm_eig', 'od_eig', 'se_eig', 'fe_eig'], intor='ovlp', verbose='False'):
    assert len(xyz_files) == len(mo_dirs)
    oldmeta = None
    all_e_hf = []
    all_e_occ = []
    all_c_occ = []
    all_dm_dict = {name:[] for name in eig_names}
    
    for xf, md in zip(xyz_files, mo_dirs):
        meta, e_hf, e_occ, c_occ = proj_frame(xf, md, basis=basis, ename=ename, intor=intor, verbose=verbose)
        if oldmeta is not None:
            assert all(oldmeta == meta), "all frames has to be in the same system thus meta has to be equal!"
        oldmeta = meta
        all_e_hf.append(e_hf)
        all_e_occ.append(e_occ)
        all_c_occ.append(c_occ)
        for name, dm_list in all_dm_dict.items():
            dm_list.append(2 * calc_eig(name, c_occ, e_occ, xf, shell=SHELL)) # multiply by 2 for restricted method, doubly occupied orbitals
        print(f"{xf} && {md} finished")

    all_e_hf = np.concatenate(all_e_hf)
    all_e_occ = np.concatenate(all_e_occ)
    all_c_occ = np.concatenate(all_c_occ)
    for name in all_dm_dict.keys():
        all_dm_dict[name] = np.concatenate(all_dm_dict[name])

    dump_data(dump_dir, meta, all_e_hf, all_e_occ, all_c_occ, all_dm_dict)
    print("done")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="project mo_coeffs into atomic basis and calculate descriptors.")
    parser.add_argument("-x", "--xyz-file", nargs="+", help="input xyz file(s), if more than one, concat them")
    parser.add_argument("-f", "--mo-dir", nargs="+", help="input mo folder(s), must of same number with xyz files")
    parser.add_argument("-d", "--dump-dir", default=".", help="dir of dumped files, if not specified, use current folder")
    parser.add_argument("-v", "--verbose", action='store_true', help="output calculation information")
    parser.add_argument("-I", "--intor", default="ovlp", help="intor string used to calculate int1e")
    parser.add_argument("-B", "--basis", default="ccpvtz", type=str, help="basis used to do the calculation")
    parser.add_argument("-e", "--ename", default="e_hf", help="file name for total energy")
    parser.add_argument("-E", "--eig-name", nargs="*", default=['dm_eig', 'od_eig', 'se_eig', 'fe_eig'], 
                        help="name of eigen values to be calculated and dumped")
    args = parser.parse_args()
    
    main(args.xyz_file, args.mo_dir, args.dump_dir, args.basis,
         args.ename, args.eig_name, args.intor, args.verbose)