Commit 47af8be9 authored by yuhai

Initial commit
name: Mirror to Gitee Repo
on: [ push, delete, create ]
# Ensures that only one mirror task will run at a time.
concurrency:
group: git-mirror
jobs:
git-mirror:
runs-on: ubuntu-latest
steps:
- uses: wearerequired/git-mirror-action@v1
env:
ORGANIZATION: deepmodeling
SSH_PRIVATE_KEY: ${{ secrets.SYNC_GITEE_PRIVATE_KEY }}
with:
source-repo: "https://github.com/deepmodeling/deepks-kit.git"
destination-repo: "git@gitee.com:deepmodeling/deepks-kit.git"
# User defined
*~
checkpoint
model.ckpt.*
.vscode
.ipynb_*
*.swp
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
_version.py
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
GNU LESSER GENERAL PUBLIC LICENSE
Version 3, 29 June 2007
Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
This version of the GNU Lesser General Public License incorporates
the terms and conditions of version 3 of the GNU General Public
License, supplemented by the additional permissions listed below.
0. Additional Definitions.
As used herein, "this License" refers to version 3 of the GNU Lesser
General Public License, and the "GNU GPL" refers to version 3 of the GNU
General Public License.
"The Library" refers to a covered work governed by this License,
other than an Application or a Combined Work as defined below.
An "Application" is any work that makes use of an interface provided
by the Library, but which is not otherwise based on the Library.
Defining a subclass of a class defined by the Library is deemed a mode
of using an interface provided by the Library.
A "Combined Work" is a work produced by combining or linking an
Application with the Library. The particular version of the Library
with which the Combined Work was made is also called the "Linked
Version".
The "Minimal Corresponding Source" for a Combined Work means the
Corresponding Source for the Combined Work, excluding any source code
for portions of the Combined Work that, considered in isolation, are
based on the Application, and not on the Linked Version.
The "Corresponding Application Code" for a Combined Work means the
object code and/or source code for the Application, including any data
and utility programs needed for reproducing the Combined Work from the
Application, but excluding the System Libraries of the Combined Work.
1. Exception to Section 3 of the GNU GPL.
You may convey a covered work under sections 3 and 4 of this License
without being bound by section 3 of the GNU GPL.
2. Conveying Modified Versions.
If you modify a copy of the Library, and, in your modifications, a
facility refers to a function or data to be supplied by an Application
that uses the facility (other than as an argument passed when the
facility is invoked), then you may convey a copy of the modified
version:
a) under this License, provided that you make a good faith effort to
ensure that, in the event an Application does not supply the
function or data, the facility still operates, and performs
whatever part of its purpose remains meaningful, or
b) under the GNU GPL, with none of the additional permissions of
this License applicable to that copy.
3. Object Code Incorporating Material from Library Header Files.
The object code form of an Application may incorporate material from
a header file that is part of the Library. You may convey such object
code under terms of your choice, provided that, if the incorporated
material is not limited to numerical parameters, data structure
layouts and accessors, or small macros, inline functions and templates
(ten or fewer lines in length), you do both of the following:
a) Give prominent notice with each copy of the object code that the
Library is used in it and that the Library and its use are
covered by this License.
b) Accompany the object code with a copy of the GNU GPL and this license
document.
4. Combined Works.
You may convey a Combined Work under terms of your choice that,
taken together, effectively do not restrict modification of the
portions of the Library contained in the Combined Work and reverse
engineering for debugging such modifications, if you also do each of
the following:
a) Give prominent notice with each copy of the Combined Work that
the Library is used in it and that the Library and its use are
covered by this License.
b) Accompany the Combined Work with a copy of the GNU GPL and this license
document.
c) For a Combined Work that displays copyright notices during
execution, include the copyright notice for the Library among
these notices, as well as a reference directing the user to the
copies of the GNU GPL and this license document.
d) Do one of the following:
0) Convey the Minimal Corresponding Source under the terms of this
License, and the Corresponding Application Code in a form
suitable for, and under terms that permit, the user to
recombine or relink the Application with a modified version of
the Linked Version to produce a modified Combined Work, in the
manner specified by section 6 of the GNU GPL for conveying
Corresponding Source.
1) Use a suitable shared library mechanism for linking with the
Library. A suitable mechanism is one that (a) uses at run time
a copy of the Library already present on the user's computer
system, and (b) will operate properly with a modified version
of the Library that is interface-compatible with the Linked
Version.
e) Provide Installation Information, but only if you would otherwise
be required to provide such information under section 6 of the
GNU GPL, and only to the extent that such information is
necessary to install and execute a modified version of the
Combined Work produced by recombining or relinking the
Application with a modified version of the Linked Version. (If
you use option 4d0, the Installation Information must accompany
the Minimal Corresponding Source and Corresponding Application
Code. If you use option 4d1, you must provide the Installation
Information in the manner specified by section 6 of the GNU GPL
for conveying Corresponding Source.)
5. Combined Libraries.
You may place library facilities that are a work based on the
Library side by side in a single library together with other library
facilities that are not Applications and are not covered by this
License, and convey such a combined library under terms of your
choice, if you do both of the following:
a) Accompany the combined library with a copy of the same work based
on the Library, uncombined with any other library facilities,
conveyed under the terms of this License.
b) Give prominent notice with the combined library that part of it
is a work based on the Library, and explaining where to find the
accompanying uncombined form of the same work.
6. Revised Versions of the GNU Lesser General Public License.
The Free Software Foundation may publish revised and/or new versions
of the GNU Lesser General Public License from time to time. Such new
versions will be similar in spirit to the present version, but may
differ in detail to address new problems or concerns.
Each version is given a distinguishing version number. If the
Library as you received it specifies that a certain numbered version
of the GNU Lesser General Public License "or any later version"
applies to it, you have the option of following the terms and
conditions either of that published version or of any later version
published by the Free Software Foundation. If the Library as you
received it does not specify a version number of the GNU Lesser
General Public License, you may choose any version of the GNU Lesser
General Public License ever published by the Free Software Foundation.
If the Library as you received it specifies that a proxy can decide
whether future versions of the GNU Lesser General Public License shall
apply, that proxy's public statement of acceptance of any version is
permanent authorization for you to choose that version for the
Library.
# DeePKS-kit
DeePKS-kit is a program to generate accurate energy functionals for quantum chemistry systems,
for both the perturbative scheme (DeePHF) and the self-consistent scheme (DeePKS).
The program provides a command line interface `deepks` with five sub-commands (see the usage example below the list):
- `train`: train a neural network based post-HF energy functional model
- `test`: test the post-HF model with given data and show statistics
- `scf`: run self-consistent field calculations with a given energy model
- `stats`: collect and print statistics of the SCF results
- `iterate`: iteratively train a self-consistent model by combining the four commands above
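Each sub-command takes an optional YAML input file plus command line flags (run with `-h` for the full list). The file names below are placeholders for your own inputs:
```bash
# show the available options of a sub-command
deepks train -h
# train a model from a YAML input file
deepks train train_input.yaml
# run SCF calculations with a trained model and dump the results
deepks scf scf_input.yaml -m model.pth -s ./systems/* -d ./results
```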
## Installation
DeePKS-kit is a pure Python library, so it can be installed following the standard `git clone` then `pip install` procedure. Note that the two main requirements, `pytorch` and `pyscf`, will not be installed automatically, so you need to install them manually in advance. Below are more detailed instructions that include installing the required libraries into the environment.
We use `conda` here as an example, so first you may need to install [Anaconda](https://docs.anaconda.com/anaconda/install/) or [Miniconda](https://docs.conda.io/en/latest/miniconda.html).
To reduce the possibility of library conflicts, we suggest creating a new environment (named `deepks`) with the basic dependencies installed (optional):
```bash
conda create -n deepks numpy scipy h5py ruamel.yaml paramiko
conda activate deepks
```
Now you are in the new environment called `deepks`.
Next, install [PyTorch](https://pytorch.org/get-started/locally/)
```bash
# assuming a GPU with cudatoolkit 10.2 support
conda install pytorch cudatoolkit=10.2 -c pytorch
```
and [PySCF](https://github.com/pyscf/pyscf).
```bash
# the conda package does not support python >= 3.8 so we use pip
pip install pyscf
```
Once the environment has been set up properly, use pip to install DeePKS-kit:
```bash
pip install git+https://github.com/deepmodeling/deepks-kit/
```
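As a quick sanity check, the command line entry point should now be available (assuming the install succeeded):
```bash
deepks -h
# or equivalently, run the package as a module
python -m deepks -h
```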
## Usage
A relatively detailed description of the `deepks-kit` library can be found [here](https://arxiv.org/pdf/2012.14615.pdf). Please also refer to the references below for a description of the methods.
Please see the [`examples`](./examples) folder for the usage of the `deepks-kit` library. A detailed example with executable data for a single water molecule can be found [here](./examples/water_single). A more complicated one for training on water clusters can be found [here](./examples/water_cluster).
Check [this input file](./examples/water_cluster/args.yaml) for a detailed explanation of the possible input parameters, and also [this one](./examples/water_cluster/shell.yaml) if you would like to run on a local machine instead of using the Slurm scheduler.
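As a sketch, the iterative training in the `water_cluster` example can be launched by passing one or more YAML files to `deepks iterate` (this assumes a local clone of the repository; adapt the paths and file names to your own setup):
```bash
cd examples/water_cluster
# run the iterative training driven by the input file; for a local run,
# shell.yaml can be appended as a second input (later files take priority)
deepks iterate args.yaml
# if the RECORD file already exists in the working directory, rerunning
# the same command tries to restart the procedure instead of starting over
```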
## References
[1] Chen, Y., Zhang, L., Wang, H. and E, W., 2020. Ground State Energy Functional with Hartree–Fock Efficiency and Chemical Accuracy. The Journal of Physical Chemistry A, 124(35), pp.7155-7165.
[2] Chen, Y., Zhang, L., Wang, H. and E, W., 2021. DeePKS: A Comprehensive Data-Driven Approach toward Chemically Accurate Density Functional Theory. Journal of Chemical Theory and Computation, 17(1), pp.170–181.
<!-- ## TODO
- [ ] Print loss separately for E and F in training.
- [ ] Rewrite all `print` function using `logging`.
- [ ] Write a detailed README and more docs.
- [ ] Add unit tests. -->
__author__ = "Yixiao Chen"
try:
from ._version import version as __version__
except ImportError:
__version__ = 'unknown'
__all__ = [
"iterate",
"model",
"scf",
"task",
# "tools" # collection of command line scripts, should not be imported by user
]
def __getattr__(name):
from importlib import import_module
if name in __all__:
return import_module("." + name, __name__)
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
import os
import sys
try:
import deepks
except ImportError as e:
sys.path.append(os.path.dirname(os.path.realpath(__file__)) + "/../")
from deepks.main import main_cli
if __name__ == "__main__":
main_cli()
__all__ = [
"iterate",
"template",
]
from .iterate import make_scf, make_train, make_iterate
import os
import sys
try:
import deepks
except ImportError as e:
sys.path.append(os.path.dirname(os.path.realpath(__file__)) + "/../")
from deepks.main import iter_cli
if __name__ == "__main__":
iter_cli()
import os
import sys
import numpy as np
try:
import deepks
except ImportError as e:
sys.path.append(os.path.dirname(os.path.realpath(__file__)) + "/../../")
from deepks.utils import copy_file, link_file
from deepks.utils import load_yaml, save_yaml
from deepks.utils import load_sys_paths
from deepks.utils import load_basis, save_basis
from deepks.task.workflow import Sequence, Iteration
from deepks.iterate.template import make_scf, make_train
# args not specified here may cause error
DEFAULT_SCF_MACHINE = {
"sub_size": 1, # how many systems are put in one task (folder)
"sub_res": None, # the resources for the sub step when ingroup_parallel > 1
"group_size": 1, # how many tasks are submitted in one job
"ingroup_parallel": 1, # how many tasks can run at the same time in one job
"dispatcher": None, # use the default lazy-local slurm defined in task.py
"resources": None, # use the default 10-core setting defined in template.py
"python": "python" # use the current python in path
}
# args not specified here may cause error
DEFAULT_TRN_MACHINE = {
"dispatcher": None, # use the default lazy-local slurm defined in task.py
"resources": None, # use the default 10-core setting defined in template.py
"python": "python" # use the current python in path
}
SCF_ARGS_NAME = "scf_input.yaml"
TRN_ARGS_NAME = "train_input.yaml"
INIT_SCF_NAME = "init_scf.yaml"
INIT_TRN_NAME = "init_train.yaml"
DATA_TRAIN = "data_train"
DATA_TEST = "data_test"
MODEL_FILE = "model.pth"
PROJ_BASIS = "proj_basis.npz"
SCF_STEP_DIR = "00.scf"
TRN_STEP_DIR = "01.train"
RECORD = "RECORD"
SYS_TRAIN = "systems_train"
SYS_TEST = "systems_test"
DEFAULT_TRAIN = "systems_train.raw"
DEFAULT_TEST = "systems_test.raw"
def assert_exist(path):
if not os.path.exists(path):
raise FileNotFoundError(f"No required file or directory: {path}")
def check_share_folder(data, name, share_folder="share"):
# save data to share_folder/name.
# if data is None or False, do nothing, return None
# otherwise, return name, and do one of the following:
# if data is True, check the existence in share.
# if data is a file name, copy it to share.
# if data is a dict, save it as a yaml file in share.
# otherwise, throw an error
if not data:
return None
dst_name = os.path.join(share_folder, name)
if data is True:
assert_exist(dst_name)
return name
elif isinstance(data, str) and os.path.exists(data):
copy_file(data, dst_name)
return name
elif isinstance(data, dict):
save_yaml(data, dst_name)
return name
else:
raise ValueError(f"Invalid argument: {data}")
def check_arg_dict(data, default, strict=True):
if data is None:
data = {}
if isinstance(data, str):
data = load_yaml(data)
allowed = {k:v for k,v in data.items() if k in default}
outside = {k:v for k,v in data.items() if k not in default}
if outside:
print(f"following ars are not in the default list: {list(outside.keys())}"
+"and would be discarded" if strict else "but kept", file=sys.stderr)
if strict:
return {**default, **allowed}
else:
return {**default, **data}
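# e.g. check_arg_dict({"python": "python3", "foo": 1}, DEFAULT_TRN_MACHINE)
# returns the default settings with "python" overridden; the unknown key
# "foo" is discarded when strict=True and kept otherwise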
def collect_systems(systems, folder=None):
# check all systems have different basename
# if there's duplicate, concat its dirname into the basename sep by a "."
# then collect all systems into `folder` by symlink
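# e.g. two inputs ".../path1/water" and ".../path2/water" would be collected
# as "path1.water" and "path2.water" under `folder`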
sys_list = [os.path.abspath(s) for s in load_sys_paths(systems)]
parents, bases = map(list, zip(*[os.path.split(s.rstrip(os.path.sep))
for s in sys_list]))
dups = range(len(sys_list))
while True:
count_dict = {bases[i]:[] for i in dups}
for i in dups:
count_dict[bases[i]].append(i)
dup_dict = {k:v for k,v in count_dict.items() if len(v)>1}
if not dup_dict:
break
dups = sum(dup_dict.values(), [])
if all(parents[i] in ("/", "") for i in dups):
print("System list have duplicated terms, index:", dups, file=sys.stderr)
break
for di in dups:
if parents[di] in ("/", ""):
continue
newp, newb = os.path.split(parents[di])
parents[di] = newp
bases[di] = f"{newb}.{bases[di]}"
if folder is None:
return bases
targets = [os.path.join(folder, b) for b in bases]
for s, t in zip(sys_list, targets):
link_file(s, t, use_abs=True)
return targets
def make_iterate(systems_train=None, systems_test=None, n_iter=0,
*, proj_basis=None, workdir=".", share_folder="share",
scf_input=True, scf_machine=None,
train_input=True, train_machine=None,
init_model=False, init_scf=True, init_train=True,
init_scf_machine=None, init_train_machine=None,
cleanup=False, strict=True):
r"""
Make a `Workflow` to do the iterative training procedure.
The procedure will be conducted in `workdir` for `n_iter` iterations.
Each iteration of the procedure is done in a sub-folder ``iter.XX``,
which further contains two sub-folders, ``00.scf`` and ``01.train``.
The `Workflow` is only created but not executed.
Parameters
----------
systems_train: str or list of str, optional
System paths used as training set in the procedure. These paths
can refer to systems or a file that contains multiple system paths.
Systems must be .xyz files or folders containing .npy files.
If not given, use ``$share_folder/systems_train.raw`` as default.
systems_test: str or list of str, optional
System paths used as the testing (or validation) set in the procedure.
The format is the same as `systems_train`. If not given, use the last
system in the training set as the testing system.
n_iter: int, optional
The number of iterations to do. Default is 0.
proj_basis: str, optional
The basis set used to project the density matrix onto.
Can be a `.npz` file specifying the coefficients in pyscf's format.
If not given, use the default basis.
workdir: str, optional
The working directory. Default is current directory (`.`).
share_folder: str, optional
The folder to store shared files in the iteration, including
``scf_input.yaml``, ``train_input.yaml``, and possibly files for
initialization. Default is ``share``.
scf_input: bool or str or dict, optional
Arguments used to specify the SCF calculation. If given `None` or
`False`, bypass the checking and use program default (unreliable).
Otherwise, the arguments would be saved as a YAML file at
``$share_folder/scf_input.yaml`` and used for SCF calculation.
Default is `True`, which will check and use the existing file.
If given a string of file path, copy the corresponding file into
target location. If given a dict, dump it into the target file.
scf_machine: str or dict, optional
Arguments used to specify the job settings of SCF calculation,
including submitting method, resources, group size, etc..
If given a string of file path, load that file as a dict in
YAML format. If not given, use the program's default setup.
train_input: bool or str or dict, optional
Arguments used to specify the training of neural network.
It follows the same rule as `scf_input`, only that the target
location is ``$share_folder/train_input.yaml``.
train_machine: str or dict, optional
Arguments used to specify the job settings of NN training.
It follows the same rule as `scf_machine`, but without grouping.
init_model: bool or str, optional
Decide whether to use an existing model as the starting point.
If set to `False` (default), use `init_scf` and `init_train`
to run an extra initialization iteration in folder ``iter.init``.
If set to `True`, look for a model at ``$share_folder/init/model.pth``.
If given a string of path, copy that file into target location.
init_scf: bool or str or dict, optional
Similar to `scf_input` but used for init calculation. The target
location is ``$share_folder/init_scf.yaml``. Defaults to True.
init_scf_machine: str or dict, optional
If specified, use different machine settings for init scf jobs.
init_train: bool or str or dict, optional
Similar to `train_input` but used for init calculation. The target
location is ``$share_folder/init_train.yaml``. Defaults to True.
init_train_machine: str or dict, optional
If specified, use different machine settings for init training job.
cleanup: bool, optional
Whether to remove job files during calculation,
such as ``slurm-*.out`` and ``err``. Defaults to False.
strict: bool, optional
If True (default), discard any arguments in `scf_machine` and `train_machine`
that are not in the default settings; if False, pass them on to the task constructor.
Returns
-------
iterate: Iteration (subclass of Workflow)
An instance of workflow that can be executed by `iterate.run()`.
Raises
------
FileNotFoundError
Raise an Error when the system or argument files are required but
not found in the share folder.
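Examples
--------
A minimal sketch (all file paths here are hypothetical)::

    flow = make_iterate(systems_train="./systems_train.raw", n_iter=5,
                        scf_input="./my_scf.yaml", train_input="./my_train.yaml")
    flow.run()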
"""
# check share folder contains required data
# and collect the systems into share folder
if systems_train is None: # load default training systems
default_train = os.path.join(share_folder, DEFAULT_TRAIN)
assert_exist(default_train) # must have training systems.
systems_train = default_train
systems_train = collect_systems(systems_train, os.path.join(share_folder, SYS_TRAIN))
# check test systems
if systems_test is None: # try to load default testing systems
default_test = os.path.join(share_folder, DEFAULT_TEST)
if os.path.exists(default_test): # if exists then use it
systems_test = default_test
else: # if empty, use the last training system as the test system
systems_test = systems_train[-1]
systems_test = collect_systems(systems_test, os.path.join(share_folder, SYS_TEST))
# check share folder contains required yaml file
scf_args_name = check_share_folder(scf_input, SCF_ARGS_NAME, share_folder)
train_args_name = check_share_folder(train_input, TRN_ARGS_NAME, share_folder)
# check required machine parameters
scf_machine = check_arg_dict(scf_machine, DEFAULT_SCF_MACHINE, strict)
train_machine = check_arg_dict(train_machine, DEFAULT_TRN_MACHINE, strict)
# handle projection basis
if proj_basis is not None:
save_basis(os.path.join(share_folder, PROJ_BASIS), load_basis(proj_basis))
proj_basis = PROJ_BASIS
# make tasks
scf_step = make_scf(
systems_train=systems_train, systems_test=systems_test,
train_dump=DATA_TRAIN, test_dump=DATA_TEST, no_model=False,
workdir=SCF_STEP_DIR, share_folder=share_folder,
source_arg=scf_args_name, source_model=MODEL_FILE,
source_pbasis=proj_basis, cleanup=cleanup, **scf_machine
)
train_step = make_train(
source_train=DATA_TRAIN, source_test=DATA_TEST,
restart=True, source_model=MODEL_FILE, save_model=MODEL_FILE,
source_pbasis=proj_basis, source_arg=train_args_name,
workdir=TRN_STEP_DIR, share_folder=share_folder,
cleanup=cleanup, **train_machine
)
per_iter = Sequence([scf_step, train_step])
iterate = Iteration(per_iter, n_iter,
workdir=".", record_file=os.path.join(workdir, RECORD))
# make init
if init_model: # if set to True or given a str, check share/init/model.pth
init_folder=os.path.join(share_folder, "init")
check_share_folder(init_model, MODEL_FILE, init_folder)
iterate.set_init_folder(init_folder)
elif init_scf or init_train: # otherwise, make an init iteration to train the first model
init_scf_name = check_share_folder(init_scf, INIT_SCF_NAME, share_folder)
init_train_name = check_share_folder(init_train, INIT_TRN_NAME, share_folder)
init_scf_machine = (check_arg_dict(init_scf_machine, DEFAULT_SCF_MACHINE, strict)
if init_scf_machine is not None else scf_machine)
init_train_machine = (check_arg_dict(init_train_machine, DEFAULT_TRN_MACHINE, strict)
if init_train_machine is not None else train_machine)
scf_init = make_scf(
systems_train=systems_train, systems_test=systems_test,
train_dump=DATA_TRAIN, test_dump=DATA_TEST, no_model=True,
workdir=SCF_STEP_DIR, share_folder=share_folder,
source_arg=init_scf_name, source_model=None, source_pbasis=proj_basis,
cleanup=cleanup, **init_scf_machine
)
train_init = make_train(
source_train=DATA_TRAIN, source_test=DATA_TEST,
restart=False, source_model=MODEL_FILE, save_model=MODEL_FILE,
source_pbasis=proj_basis, source_arg=init_train_name,
workdir=TRN_STEP_DIR, share_folder=share_folder,
cleanup=cleanup, **init_train_machine
)
init_iter = Sequence([scf_init, train_init], workdir="iter.init")
iterate.prepend(init_iter)
return iterate
def main(*args, **kwargs):
r"""
Make a `Workflow` to do the iterative training procedure and run it.
The parameters are the same as `make_iterate`, but the jobs will be run.
If ``$workdir/RECORD`` exists, the procedure will try to restart.
The procedure will be conducted in `workdir` for `n_iter` iterations.
Each iteration of the procedure is done in a sub-folder ``iter.XX``,
which further contains two sub-folders, ``00.scf`` and ``01.train``.
See `make_iterate` for detailed parameters.
"""
# pass all arguments to make_iterate and run it
iterate = make_iterate(*args, **kwargs)
if os.path.exists(iterate.record_file):
iterate.restart()
else:
iterate.run()
if __name__ == "__main__":
from deepks.main import iter_cli as cli
cli()
import os
import sys
import numpy as np
from glob import glob
from deepks.utils import check_list
from deepks.utils import flat_file_list
from deepks.utils import get_sys_name, load_sys_paths
from deepks.task.task import PythonTask, ShellTask
from deepks.task.task import BatchTask, GroupBatchTask
from deepks.task.workflow import Sequence
from deepks.utils import QCDIR
SCF_CMD = " ".join([
"{python} -u",
"-m deepks.scf.run"
# os.path.join(QCDIR, "scf/run.py") # this is the backup choice
])
TRN_CMD = " ".join([
"{python} -u",
"-m deepks.model.train"
# os.path.join(QCDIR, "train/train.py") # this is the backup choice
])
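# with the default python these render to, e.g.,
#   python -u -m deepks.scf.run <scf args ...>
#   python -u -m deepks.model.train <train args ...>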
DEFAULT_SCF_RES = {
"time_limit": "24:00:00",
"cpus_per_task": 8,
"mem_limit": 8,
"envs": {
"PYSCF_MAX_MEMORY": 8000
}
}
DEFAULT_SCF_SUB_RES = {
"numb_node": 1,
"task_per_node": 1,
"cpus_per_task": 8,
"exclusive": True
}
DEFAULT_TRN_RES = {
"time_limit": "24:00:00",
"cpus_per_task": 8,
# "numb_gpu": 1, # do not use gpu by default
"mem_limit": 8
}
def check_system_names(systems):
sys_names = [get_sys_name(os.path.basename(s)) for s in systems]
if len(set(sys_names)) != len(systems):
raise ValueError("Systems have duplicated base names. Not supported yet.")
def make_cleanup(pattern="slurm-*.out", workdir=".", **task_args):
pattern = check_list(pattern)
pattern = " ".join(pattern)
assert pattern
return ShellTask(
f"rm -r {pattern}",
workdir=workdir,
**task_args
)
def make_scf_task(*, workdir=".",
arg_file="scf_input.yaml", source_arg=None,
model_file="model.pth", source_model=None,
proj_basis=None, source_pbasis=None,
systems="systems.raw", link_systems=True,
dump_dir="results", share_folder="share",
outlog="log.scf", group_data=None,
dispatcher=None, resources=None,
python="python", **task_args):
# set up basic args
command = SCF_CMD.format(python=python)
link_share = task_args.pop("link_share_files", [])
link_prev = task_args.pop("link_prev_files", [])
link_abs = task_args.pop("link_abs_files", [])
forward_files = task_args.pop("forward_files", [])
backward_files = task_args.pop("backward_files", [])
sys_name = None
# set up optional args
if arg_file:
command += f" {arg_file}"
if source_arg is not None:
link_share.append((source_arg, arg_file))
forward_files.append(arg_file)
if model_file:
command += f" -m {model_file}"
if model_file.upper() != "NONE":
if source_model is not None:
link_prev.append((source_model, model_file))
forward_files.append(model_file)
if proj_basis:
command += f" -P {proj_basis}"
if source_pbasis is not None:
link_share.append((source_pbasis, proj_basis))
forward_files.append(proj_basis)
if systems:
# check system paths and make forward files
sys_paths = [os.path.abspath(s) for s in load_sys_paths(systems)]
sys_base = [get_sys_name(s) for s in sys_paths]
sys_name = [os.path.basename(s) for s in sys_base]
if link_systems:
target_dir = "systems"
src_files = sum((glob(f"{base}*") for base in sys_base), [])
for fl in src_files:
dst = os.path.join(target_dir, os.path.basename(fl))
link_abs.append((fl, dst))
forward_files.append(target_dir)
sys_str = os.path.join(target_dir, "*")
else: # cannot forward files here
sys_str = " ".join(sys_paths)
command += f" -s {sys_str}"
if dump_dir:
command += f" -d {dump_dir}"
if sys_name:
for nm in sys_name:
backward_files.append(os.path.join(dump_dir, nm))
else: # backward whole folder, may cause problem
backward_files.append(dump_dir)
if group_data is not None:
command += " -G" if group_data else " -NG"
# make task
return BatchTask(
command,
workdir=workdir,
dispatcher=dispatcher,
resources=resources,
outlog=outlog,
share_folder=share_folder,
link_share_files=link_share,
link_prev_files=link_prev,
link_abs_files=link_abs,
forward_files=forward_files,
backward_files=backward_files,
**task_args
)
def make_run_scf(systems_train, systems_test=None, *,
train_dump="data_train", test_dump="data_test",
no_model=False, group_data=None,
workdir='.', share_folder='share', outlog="log.scf",
source_arg="scf_input.yaml", source_model="model.pth",
source_pbasis=None, dispatcher=None, resources=None,
sub_size=1, group_size=1, ingroup_parallel=1,
sub_res=None, python='python', **task_args):
# if no test systems, use last one in train systems
systems_train = [os.path.abspath(s) for s in load_sys_paths(systems_train)]
systems_test = [os.path.abspath(s) for s in load_sys_paths(systems_test)]
if not systems_test:
systems_test.append(systems_train[-1])
# if len(systems_train) > 1:
# del systems_train[-1]
check_system_names(systems_train)
check_system_names(systems_test)
# split systems into groups
nsys_trn = len(systems_train)
nsys_tst = len(systems_test)
ntask_trn = int(np.ceil(nsys_trn / sub_size))
ntask_tst = int(np.ceil(nsys_tst / sub_size))
train_sets = [systems_train[i::ntask_trn] for i in range(ntask_trn)]
test_sets = [systems_test[i::ntask_tst] for i in range(ntask_tst)]
# make subtasks
model_file = "../model.pth" if not no_model else "NONE"
proj_basis = "../proj_basis.npz" if source_pbasis else None
nd = max(len(str(ntask_trn+ntask_tst)), 2)
if sub_res is None:
sub_res = {}
sub_res = {**DEFAULT_SCF_SUB_RES, **sub_res}
trn_tasks = [
make_scf_task(systems=sset, workdir=f"task.trn.{i:0{nd}}",
arg_file="../scf_input.yaml", source_arg=None,
model_file=model_file, source_model=None,
proj_basis=proj_basis, source_pbasis=None,
dump_dir=f"../{train_dump}", group_data=group_data,
link_systems=True, resources=sub_res, python=python)
for i, sset in enumerate(train_sets)
]
tst_tasks = [
make_scf_task(systems=sset, workdir=f"task.tst.{i:0{nd}}",
arg_file="../scf_input.yaml", source_arg=None,
model_file=model_file, source_model=None,
proj_basis=proj_basis, source_pbasis=None,
dump_dir=f"../{test_dump}", group_data=group_data,
link_systems=True, resources=sub_res, python=python)
for i, sset in enumerate(test_sets)
]
# set up optional args
link_share = task_args.pop("link_share_files", [])
link_share.append((source_arg, "scf_input.yaml"))
if source_pbasis:
link_share.append((source_pbasis, "proj_basis.npz"))
link_prev = task_args.pop("link_prev_files", [])
if not no_model:
link_prev.append((source_model, "model.pth"))
if resources is None:
resources = {}
resources = {**DEFAULT_SCF_RES, "numb_node": ingroup_parallel, **resources}
# make task
return GroupBatchTask(
trn_tasks + tst_tasks,
workdir=workdir,
group_size=group_size,
ingroup_parallel=ingroup_parallel,
dispatcher=dispatcher,
resources=resources,
outlog=outlog,
errlog="err",
share_folder=share_folder,
link_share_files=link_share,
link_prev_files=link_prev
)
def make_stat_scf(systems_train, systems_test=None, *,
train_dump="data_train", test_dump="data_test", group_data=False,
workdir='.', outlog="log.data", **stat_args):
# follow same convention for systems as run_scf
systems_train = [os.path.abspath(s) for s in load_sys_paths(systems_train)]
systems_test = [os.path.abspath(s) for s in load_sys_paths(systems_test)]
if not systems_test:
systems_test.append(systems_train[-1])
# if len(systems_train) > 1:
# del systems_train[-1]
# load stats function
from deepks.scf.stats import print_stats
stat_args.update(
systems=systems_train,
test_sys=systems_test,
dump_dir=train_dump,
test_dump=test_dump,
group=group_data)
# make task
return PythonTask(
print_stats,
call_kwargs=stat_args,
outlog=outlog,
errlog="err",
workdir=workdir
)
def make_scf(systems_train, systems_test=None, *,
train_dump="data_train", test_dump="data_test",
no_model=False, workdir='00.scf', share_folder='share',
source_arg="scf_input.yaml", source_model="model.pth",
source_pbasis=None, dispatcher=None, resources=None,
sub_size=1, group_size=1, ingroup_parallel=1,
sub_res=None, python='python',
cleanup=False, **task_args):
run_scf = make_run_scf(
systems_train, systems_test,
train_dump=train_dump, test_dump=test_dump,
no_model=no_model, group_data=False,
workdir=".", outlog="log.scf", share_folder=share_folder,
source_arg=source_arg, source_model=source_model, source_pbasis=source_pbasis,
dispatcher=dispatcher, resources=resources,
group_size=group_size, ingroup_parallel=ingroup_parallel,
sub_size=sub_size, sub_res=sub_res, python=python, **task_args
)
post_scf = make_stat_scf(
systems_train=systems_train, systems_test=systems_test,
train_dump=train_dump, test_dump=test_dump, workdir=".",
outlog="log.data", group_data=False
)
# concat
seq = [run_scf, post_scf]
if cleanup:
clean_scf = make_cleanup(
["slurm-*.out", "task.*/err", "fin.record"],
workdir=".")
seq.append(clean_scf)
# make sequence
return Sequence(
seq,
workdir=workdir
)
def make_train_task(*, workdir=".",
arg_file="train_input.yaml", source_arg=None,
restart_model=None, source_model=None,
proj_basis=None, source_pbasis=None,
save_model="model.pth", group_data=False,
data_train="data_train", source_train=None,
data_test="data_test", source_test=None,
share_folder="share", outlog="log.train",
dispatcher=None, resources=None,
python="python", **task_args):
# set up basic args
command = TRN_CMD.format(python=python)
link_share = task_args.pop("link_share_files", [])
link_prev = task_args.pop("link_prev_files", [])
forward_files = task_args.pop("forward_files", [])
backward_files = task_args.pop("backward_files", [])
# set up optional args
if arg_file:
command += f" {arg_file}"
if source_arg is not None:
link_share.append((source_arg, arg_file))
forward_files.append(arg_file)
if restart_model:
command += f" -r {restart_model}"
if source_model is not None:
link_prev.append((source_model, restart_model))
forward_files.append(restart_model)
if proj_basis:
command += f" -P {proj_basis}"
if source_pbasis is not None:
link_share.append((source_pbasis, proj_basis))
forward_files.append(proj_basis)
if data_train:
command += f" -d {data_train}" + ("" if group_data else "/*")
if source_train is not None:
link_prev.append((source_train, data_train))
forward_files.append(data_train)
if data_test:
command += f" -t {data_test}" + ("" if group_data else "/*")
if source_test is not None:
link_prev.append((source_test, data_test))
forward_files.append(data_test)
if save_model:
command += f" -o {save_model}"
backward_files.append(save_model)
if resources is None:
resources = {}
resources = {**DEFAULT_TRN_RES, **resources}
# make task
return BatchTask(
command,
workdir=workdir,
dispatcher=dispatcher,
resources=resources,
outlog=outlog,
errlog='err',
share_folder=share_folder,
link_share_files=link_share,
link_prev_files=link_prev,
forward_files=forward_files,
backward_files=backward_files,
**task_args
)
def make_run_train(source_train="data_train", source_test="data_test", *,
restart=True, source_model="model.pth", save_model="model.pth",
source_pbasis=None, source_arg="train_input.yaml",
workdir=".", share_folder="share", outlog="log.train",
dispatcher=None, resources=None,
python="python", **task_args):
# just preset some arguments of make_train_task
# parallel training of multiple models has not been implemented yet
restart_model = "old_model.pth" if restart else None
proj_basis = "proj_basis.npz" if source_pbasis else None
return make_train_task(
workdir=workdir,
arg_file="train_input.yaml", source_arg=source_arg,
restart_model=restart_model, source_model=source_model,
proj_basis=proj_basis, source_pbasis=source_pbasis,
save_model=save_model, group_data=False,
data_train="data_train", source_train=source_train,
data_test="data_test", source_test=source_test,
share_folder=share_folder, outlog=outlog,
dispatcher=dispatcher, resources=resources,
python=python, **task_args
)
def make_test_train(data_paths, model_file="model.pth", *,
output_prefix="test", group_results=True,
workdir='.', outlog="log.test", **test_args):
from deepks.model.test import main as test_func
test_args.update(
data_paths=data_paths,
model_file=model_file,
output_prefix=output_prefix,
group=group_results)
# make task
return PythonTask(
test_func,
call_kwargs=test_args,
outlog=outlog,
errlog="err",
workdir=workdir
)
def make_train(source_train="data_train", source_test="data_test", *,
restart=True, source_model="model.pth", save_model="model.pth",
source_pbasis=None, source_arg="train_input.yaml",
workdir="01.train", share_folder="share",
dispatcher=None, resources=None,
python="python", cleanup=False, **task_args):
run_train = make_run_train(
source_train=source_train, source_test=source_test,
restart=restart, source_model=source_model, save_model=save_model,
source_pbasis=source_pbasis, source_arg=source_arg,
workdir=".", share_folder=share_folder,
outlog="log.train", dispatcher=dispatcher, resources=resources,
python=python, **task_args
)
post_train = make_test_train(
data_paths=["data_train/*","data_test/*"],
model_file=save_model, output_prefix="test", group_results=True,
workdir=".", outlog="log.test"
)
# concat
seq = [run_train, post_train]
if cleanup:
clean_train = make_cleanup(
["slurm-*.out", "err", "fin.record", "tag_*finished"],
workdir=".")
seq.append(clean_train)
# make sequence
return Sequence(
seq,
workdir=workdir
)
import os
import sys
import argparse
try:
import deepks
except ImportError as e:
sys.path.append(os.path.dirname(os.path.realpath(__file__)) + "/../")
from deepks.utils import load_yaml, deep_update
def main_cli(args=None):
parser = argparse.ArgumentParser(
prog="deepks",
description="A program to generate accurate energy functionals.")
parser.add_argument("command",
help="specify the sub-command to run, possible choices: "
"train, test, scf, stats, iterate")
parser.add_argument("args", nargs=argparse.REMAINDER,
help="arguments to be passed to the sub-command")
args = parser.parse_args(args)
# separate all sub_cli functions to make them usable independently
if args.command.upper() == "TRAIN":
sub_cli = train_cli
elif args.command.upper() == "TEST":
sub_cli = test_cli
elif args.command.upper() == "SCF":
sub_cli = scf_cli
elif args.command.upper() == "STATS":
sub_cli = stats_cli
elif args.command.upper().startswith("ITER"):
sub_cli = iter_cli
else:
raise ValueError(f"unsupported sub-command: {args.command}")
sub_cli(args.args)
def train_cli(args=None):
parser = argparse.ArgumentParser(
prog="deepks train",
description="Train a model according to given input.",
argument_default=argparse.SUPPRESS)
parser.add_argument('input', type=str, nargs="?",
help='the input yaml file for args')
parser.add_argument('-r', '--restart',
help='the restart file to load model from, would ignore model_args if given')
parser.add_argument('-d', '--train-paths', nargs="*",
help='paths to the folders of training data')
parser.add_argument('-t', '--test-paths', nargs="*",
help='paths to the folders of testing data')
parser.add_argument('-o', '--ckpt-file',
help='file to save the model parameters, default: model.pth')
parser.add_argument("-P", "--proj_basis",
help="basis set used to project density matrix")
parser.add_argument('-S', '--seed', type=int,
help='use specified seed in initialization and training')
parser.add_argument("-D", "--device",
help="device name used in training the model")
args = parser.parse_args(args)
if hasattr(args, "input"):
argdict = load_yaml(args.input)
del args.input
argdict.update(vars(args))
else:
argdict = vars(args)
from deepks.model.train import main
main(**argdict)
def test_cli(args=None):
parser = argparse.ArgumentParser(
prog="deepks test",
description="Test a model with given data (Not SCF).",
argument_default=argparse.SUPPRESS)
parser.add_argument("input", nargs="?",
help='the input yaml file used for training')
parser.add_argument("-d", "--data-paths", type=str, nargs='+',
help="the paths to data folders containing .npy files for test")
parser.add_argument("-m", "--model-file", type=str, nargs='+',
help="the dumped model file to test")
parser.add_argument("-o", "--output-prefix", type=str,
help=r"the prefix of output file, would wite into file %%prefix.%%sysidx.out")
parser.add_argument("-E", "--e-name", type=str,
help="the name of energy file to be read (no .npy extension)")
parser.add_argument("-D", "--d-name", type=str, nargs="+",
help="the name of descriptor file(s) to be read (no .npy extension)")
parser.add_argument("-G", "--group", action='store_true',
help="group test results for all systems")
args = parser.parse_args(args)
if hasattr(args, "input"):
rawdict = load_yaml(args.input)
del args.input
argdict = {}
if "ckpt_file" in rawdict["train_args"]:
argdict["model_file"] = rawdict["train_args"]["ckpt_file"]
if "e_name" in rawdict["data_args"]:
argdict["e_name"] = rawdict["data_args"]["e_name"]
if "d_name" in rawdict["data_args"]:
argdict["d_name"] = rawdict["data_args"]["d_name"]
if "test_paths" in rawdict:
argdict["data_paths"] = rawdict["test_paths"]
argdict.update(vars(args))
else:
argdict = vars(args)
from deepks.model.test import main
main(**argdict)
def scf_cli(args=None):
parser = argparse.ArgumentParser(
prog="deepks scf",
description="Calculate and save SCF results using given model.",
argument_default=argparse.SUPPRESS)
parser.add_argument("input", nargs="?",
help='the input yaml file for args')
parser.add_argument("-s", "--systems", nargs="*",
help="input molecule systems, can be xyz files or folders with npy data")
parser.add_argument("-m", "--model-file",
help="file of the trained model")
parser.add_argument("-d", "--dump-dir",
help="dir of dumped files")
parser.add_argument("-v", "--verbose", type=int, choices=range(0,6),
help="output level of calculation information")
parser.add_argument("-F", "--dump-fields", nargs="*",
help="fields to be dumped into the folder")
parser.add_argument("-B", "--basis",
help="basis set used to solve the model")
parser.add_argument("-P", "--proj_basis",
help="basis set used to project dm, must match with model")
parser.add_argument("-D", "--device",
help="device name used in nn model inference")
group0 = parser.add_mutually_exclusive_group()
group0.add_argument("-G", "--group", action='store_true', dest="group",
help="group results for all systems, only works for same number of atoms")
group0.add_argument("-NG", "--no-group", action='store_false', dest="group",
help="Do not group results for different systems (default behavior)")
parser.add_argument("-X", "--scf-xc",
help="base xc functional used in scf equation, default is HF")
parser.add_argument("--scf-conv-tol", type=float,
help="converge threshold of scf iteration")
parser.add_argument("--scf-conv-tol-grad", type=float,
help="gradient converge threshold of scf iteration")
parser.add_argument("--scf-max-cycle", type=int,
help="max number of scf iteration cycles")
parser.add_argument("--scf-diis-space", type=int,
help="subspace dimension used in diis mixing")
parser.add_argument("--scf-level-shift", type=float,
help="level shift used in scf calculation")
args = parser.parse_args(args)
scf_args={}
for k, v in vars(args).copy().items():
if k.startswith("scf_"):
scf_args[k[4:]] = v
delattr(args, k)
if hasattr(args, "input"):
argdict = load_yaml(args.input)
del args.input
argdict.update(vars(args))
argdict["scf_args"].update(scf_args)
else:
argdict = vars(args)
argdict["scf_args"] = scf_args
from deepks.scf.run import main
main(**argdict)
def stats_cli(args=None):
parser = argparse.ArgumentParser(
prog="deepks stats",
description="Print the stats of SCF results.",
argument_default=argparse.SUPPRESS)
parser.add_argument("input", nargs="?",
help='the input yaml file used for SCF calculation')
parser.add_argument("-s", "--systems", nargs="*",
help='system paths used as training set (i.e. calculate shift)')
parser.add_argument("-d", "--dump-dir",
help="directory used to save SCF results of training systems")
parser.add_argument("-ts", "--test-sys", nargs="*",
help='system paths used as testing set (i.e. not calculate shift)')
parser.add_argument("-td", "--test-dump",
help="directory used to save SCF results of testing systems")
parser.add_argument("-G", "--group", action='store_true',
help="if set, assume results are grouped")
parser.add_argument("-NC", action="store_false", dest="with_conv",
help="do not print convergence results")
parser.add_argument("-NE", action="store_false", dest="with_e",
help="do not print energy results")
parser.add_argument("-NF", action="store_false", dest="with_f",
help="do not print force results")
parser.add_argument("--e-name",
help="name of the energy file (no extension)")
parser.add_argument("--f-name",
help="name of the force file (no extension)")
args = parser.parse_args(args)
if hasattr(args, "input"):
rawdict = load_yaml(args.input)
del args.input
argdict = {fd: rawdict[fd]
for fd in ("systems", "dump_dir", "group")
if fd in rawdict}
argdict.update(vars(args))
else:
argdict = vars(args)
from deepks.scf.stats import print_stats
print_stats(**argdict)
def iter_cli(args=None):
parser = argparse.ArgumentParser(
prog="deepks iterate",
description="Run the iteration procedure to train a SCF model.",
argument_default=argparse.SUPPRESS)
parser.add_argument("argfile", nargs="*", default=[],
help='the input yaml file for args, '
'if more than one, the latter has higher priority')
parser.add_argument("-s", "--systems-train", nargs="*",
help='systems for training, '
'can be xyz files or folders with npy data')
parser.add_argument("-t", "--systems-test", nargs="*",
help='systems for testing, '
'can be xyz files or folders with npy data')
parser.add_argument("-n", "--n-iter", type=int,
help='the number of iterations to run')
parser.add_argument("--workdir",
help='working directory, default is current directory')
parser.add_argument("--share-folder",
help='folder to store share files, default is "share"')
parser.add_argument("--cleanup", action="store_true", dest="cleanup",
help='if set, clean up files used for job dispatching')
parser.add_argument("--no-strict", action="store_false", dest="strict",
help='if set, allow other arguments to be passed to task')
# allow cli specified argument files
sub_names = ["scf-input", "scf-machine", "train-input", "train-machine",
"init-model", "init-scf", "init-train"]
for name in sub_names:
parser.add_argument(f"--{name}",
help='if specified, substitute the original arguments with the given file')
args = parser.parse_args(args)
argdict = {}
for fl in args.argfile:
argdict = deep_update(argdict, load_yaml(fl))
del args.argfile
argdict.update(vars(args))
from deepks.iterate.iterate import main
main(**argdict)
if __name__ == "__main__":
main_cli()
__all__ = [
"model",
"reader",
"train",
"test",
]
def __getattr__(name):
from importlib import import_module
if name == "CorrNet":
from .model import CorrNet
return CorrNet
if name in __all__:
return import_module("." + name, __name__)
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
import os
import sys
try:
import deepks
except ImportError as e:
sys.path.append(os.path.dirname(os.path.realpath(__file__)) + "/../")
from deepks.main import train_cli
if __name__ == "__main__":
train_cli()
import math
import inspect
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F
from deepks.utils import load_basis, get_shell_sec
from deepks.utils import load_elem_table, save_elem_table
SCALE_EPS = 1e-8
def parse_actv_fn(code):
if callable(code):
return code
assert type(code) is str
lcode = code.lower()
if lcode == 'sigmoid':
return torch.sigmoid
if lcode == 'tanh':
return torch.tanh
if lcode == 'relu':
return torch.relu
if lcode == 'softplus':
return F.softplus
if lcode == 'silu':
return F.silu
if lcode == 'gelu':
return F.gelu
if lcode == 'mygelu':
return mygelu
raise ValueError(f'{code} is not a valid activation function')
def make_embedder(type, shell_sec, **kwargs):
ltype = type.lower()
if ltype in ("trace", "sum"):
EmbdCls = TraceEmbedding
elif ltype in ("thermal", "softmax"):
EmbdCls = ThermalEmbedding
else:
raise ValueError(f'{type} is not a valid embedding type')
embedder = EmbdCls(shell_sec, **kwargs)
return embedder
def mygelu(x):
return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
def log_args(name):
def decorator(func):
def wrapper(self, *args, **kwargs):
args_dict = inspect.getcallargs(func, self, *args, **kwargs)
del args_dict['self']
setattr(self, name, args_dict)
func(self, *args, **kwargs)
return wrapper
return decorator
def make_shell_mask(shell_sec):
lsize = len(shell_sec)
msize = max(shell_sec)
mask = torch.zeros(lsize, msize, dtype=bool)
for l, m in enumerate(shell_sec):
mask[l, :m] = 1
return mask
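# e.g. make_shell_mask([1, 3]) gives a 2 x 3 boolean mask:
#   [[True, False, False],
#    [True, True,  True ]]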
def pad_lastdim(sequences, padding_value=0):
# assuming the trailing dimensions and type of all the tensors
# in sequences are the same, and fetching those from sequences[0]
max_size = sequences[0].size()
front_dims = max_size[:-1]
max_len = max([s.size(-1) for s in sequences])
out_dims = front_dims + (len(sequences), max_len)
out_tensor = sequences[0].new_full(out_dims, padding_value)
for i, tensor in enumerate(sequences):
length = tensor.size(-1)
# use index notation to prevent duplicate references to the tensor
out_tensor[..., i, :length] = tensor
return out_tensor
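# e.g. pad_lastdim([torch.zeros(3, 2), torch.zeros(3, 5)]) returns a
# zero-padded tensor of shape [3, 2, 5]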
def pad_masked(tensor, mask, padding_value=0):
# equiv to pad_lastdim(tensor.split(shell_sec, dim=-1))
assert tensor.shape[-1] == mask.sum()
new_shape = tensor.shape[:-1] + mask.shape
return tensor.new_full(new_shape, padding_value).masked_scatter_(mask, tensor)
def unpad_lastdim(padded, length_list):
# inverse of pad_lastdim
return [padded[...,i,:length] for i, length in enumerate(length_list)]
def unpad_masked(padded, mask):
# equiv to torch.cat(unpad_lastdim(padded, shell_sec), dim=-1)
new_shape = padded.shape[:-mask.ndim] + (mask.sum(),)
return torch.masked_select(padded, mask).reshape(new_shape)
def masked_softmax(input, mask, dim=-1):
exps = torch.exp(input - input.max(dim=dim, keepdim=True)[0])
mexps = exps * mask.to(exps)
msums = mexps.sum(dim=dim, keepdim=True).clamp(1e-10)
return mexps / msums
class DenseNet(nn.Module):
def __init__(self, sizes, actv_fn=torch.relu, use_resnet=True, with_dt=False):
super().__init__()
self.layers = nn.ModuleList([nn.Linear(in_f, out_f)
for in_f, out_f in zip(sizes, sizes[1:])])
self.actv_fn = actv_fn
self.use_resnet = use_resnet
if with_dt:
self.dts = nn.ParameterList(
[nn.Parameter(torch.normal(torch.ones(out_f), std=0.01))
for out_f in sizes[1:]])
else:
self.dts = None
def forward(self, x):
for i, layer in enumerate(self.layers):
tmp = layer(x)
if i < len(self.layers) - 1:
tmp = self.actv_fn(tmp)
if self.use_resnet and layer.in_features == layer.out_features:
if self.dts is not None:
tmp = tmp * self.dts[i]
x = x + tmp
else:
x = tmp
return x
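# e.g. DenseNet([16, 100, 100, 100, 1]) maps a [..., 16] input to a [..., 1]
# output; the resnet-style skip is applied only to layers whose input and
# output widths are equal (here the 100 -> 100 hidden layers)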
class TraceEmbedding(nn.Module):
def __init__(self, shell_sec):
super().__init__()
self.shell_sec = shell_sec
self.ndesc = len(shell_sec)
def forward(self, x):
x_shells = x.split(self.shell_sec, dim=-1)
tr_shells = [sx.sum(-1, keepdim=True) for sx in x_shells]
return torch.cat(tr_shells, dim=-1)
class ThermalEmbedding(nn.Module):
def __init__(self, shell_sec, embd_sizes=None, init_beta=5.,
momentum=None, max_memory=1000):
super().__init__()
self.shell_sec = shell_sec
self.register_buffer("shell_mask", make_shell_mask(shell_sec), False)# shape: [l, m]
if embd_sizes is None:
embd_sizes = shell_sec
if isinstance(embd_sizes, int):
embd_sizes = [embd_sizes] * len(shell_sec)
assert len(embd_sizes) == len(shell_sec)
self.embd_sizes = embd_sizes
self.register_buffer("embd_mask", make_shell_mask(embd_sizes), False)
self.ndesc = sum(embd_sizes)
self.beta = nn.Parameter( # shape: [l, p], padded
pad_lastdim([torch.linspace(init_beta, -init_beta, ne)
for ne in embd_sizes]))
self.momentum = momentum
self.max_memory = max_memory
self.register_buffer('running_mean', torch.zeros(len(shell_sec)))
self.register_buffer('running_var', torch.ones(len(shell_sec)))
self.register_buffer('num_batches_tracked', torch.tensor(0, dtype=torch.long))
def forward(self, x):
x_padded = pad_masked(x, self.shell_mask, 0.) # shape: [n, a, l, m]
if self.training:
self.update_running_stats(x_padded)
nx_padded = ((x_padded - self.running_mean.unsqueeze(-1))
/ (self.running_var.sqrt().unsqueeze(-1) + SCALE_EPS)
* self.shell_mask.to(x_padded))
weight = masked_softmax(
torch.einsum("...lm,lp->...lmp", nx_padded, -self.beta),
self.shell_mask.unsqueeze(-1), dim=-2)
desc_padded = torch.einsum("...m,...mp->...p", x_padded, weight)
return unpad_masked(desc_padded, self.embd_mask)
def update_running_stats(self, x_padded):
self.num_batches_tracked += 1
if self.momentum is None and self.num_batches_tracked > self.max_memory:
return # stop updating after max_memory batches, so the scaling becomes a fixed parameter
exp_factor = 1. - 1. / float(self.num_batches_tracked)
if self.momentum is not None:
exp_factor = max(exp_factor, self.momentum)
with torch.no_grad():
fmask = self.shell_mask.to(x_padded)
pad_portion = fmask.mean(-1)
x_masked = x_padded * fmask # make sure padded part is zero
reduced_dim = (*range(x_masked.ndim-2), -1)
batch_mean = x_masked.mean(reduced_dim) / pad_portion
batch_var = x_masked.var(reduced_dim) / pad_portion
self.running_mean[:] = exp_factor * self.running_mean + (1-exp_factor) * batch_mean
self.running_var[:] = exp_factor * self.running_var + (1-exp_factor) * batch_var
def reset_running_stats(self):
self.running_mean.zero_()
self.running_var.fill_(1)
self.num_batches_tracked.zero_()
class CorrNet(nn.Module):
@log_args('_init_args')
def __init__(self, input_dim, hidden_sizes=(100,100,100),
actv_fn='gelu', use_resnet=True,
embedding=None, proj_basis=None, elem_table=None,
input_shift=0, input_scale=1, output_scale=1):
super().__init__()
actv_fn = parse_actv_fn(actv_fn)
self.input_dim = input_dim
# basis info
self._pbas = load_basis(proj_basis)
self._init_args["proj_basis"] = self._pbas
self.shell_sec = None
# elem const
if isinstance(elem_table, str):
elem_table = load_elem_table(elem_table)
self._init_args["elem_table"] = elem_table
self.elem_table = elem_table
self.elem_dict = None if elem_table is None else dict(zip(*elem_table))
# linear fitting
self.linear = nn.Linear(input_dim, 1).double()
# embedding net
ndesc = input_dim
self.embedder = None
if embedding is not None:
if isinstance(embedding, str):
embedding = {"type": embedding}
assert isinstance(embedding, dict)
raw_shell_sec = get_shell_sec(self._pbas)
self.shell_sec = raw_shell_sec * (input_dim // sum(raw_shell_sec))
assert sum(self.shell_sec) == input_dim
self.embedder = make_embedder(**embedding, shell_sec=self.shell_sec).double()
self.linear.requires_grad_(False) # make sure it is symmetric
ndesc = self.embedder.ndesc
# fitting net
layer_sizes = [ndesc, *hidden_sizes, 1]
self.densenet = DenseNet(layer_sizes, actv_fn, use_resnet).double()
# scaling part
self.input_shift = nn.Parameter(
torch.tensor(input_shift, dtype=torch.float64).expand(input_dim).clone(),
requires_grad=False)
self.input_scale = nn.Parameter(
torch.tensor(input_scale, dtype=torch.float64).expand(input_dim).clone(),
requires_grad=False)
self.output_scale = nn.Parameter(
torch.tensor(output_scale, dtype=torch.float64),
requires_grad=False)
self.energy_const = nn.Parameter(
torch.tensor(0, dtype=torch.float64),
requires_grad=False)
def forward(self, x):
# x: nframes x natom x nfeature
x = (x - self.input_shift) / (self.input_scale + SCALE_EPS)
l = self.linear(x)
if self.embedder is not None:
x = self.embedder(x)
y = self.densenet(x)
y = y / self.output_scale + l
e = y.sum(-2) + self.energy_const
return e
def get_elem_const(self, elems):
if self.elem_dict is None:
return 0.
return sum(self.elem_dict[ee] for ee in elems)
def set_normalization(self, shift=None, scale=None):
dtype = self.input_scale.dtype
if shift is not None:
self.input_shift.data[:] = torch.tensor(shift, dtype=dtype)
if scale is not None:
self.input_scale.data[:] = torch.tensor(scale, dtype=dtype)
def set_prefitting(self, weight, bias, trainable=False):
dtype = self.linear.weight.dtype
self.linear.weight.data[:] = torch.tensor(weight, dtype=dtype).reshape(-1)
self.linear.bias.data[:] = torch.tensor(bias, dtype=dtype).reshape(-1)
self.linear.requires_grad_(trainable)
def set_energy_const(self, const):
dtype = self.energy_const.dtype
self.energy_const.data = torch.tensor(const, dtype=dtype).reshape([])
def save_dict(self, **extra_info):
dump_dict = {
"state_dict": self.state_dict(),
"init_args": self._init_args,
"extra_info": extra_info
}
return dump_dict
def save(self, filename, **extra_info):
torch.save(self.save_dict(**extra_info), filename)
def compile(self, set_eval=True, **kwargs):
old_mode = self.training
if set_eval:
self.eval()
smodel = torch.jit.trace(
self.forward,
torch.empty((2, 2, self.input_dim)),
**kwargs)
self.train(old_mode)
return smodel
def compile_save(self, filename, **kwargs):
torch.jit.save(self.compile(**kwargs), filename)
if self.elem_table is not None:
save_elem_table(filename+".elemtab", self.elem_table)
@staticmethod
def load_dict(checkpoint, strict=False):
init_args = checkpoint["init_args"]
if "layer_sizes" in init_args:
layers = init_args.pop("layer_sizes")
init_args["input_dim"] = layers[0]
init_args["hidden_sizes"] = layers[1:-1]
model = CorrNet(**init_args)
model.load_state_dict(checkpoint['state_dict'], strict=strict)
return model
@staticmethod
def load(filename, strict=False):
try:
return torch.jit.load(filename)
except RuntimeError:
checkpoint = torch.load(filename, map_location="cpu")
return CorrNet.load_dict(checkpoint, strict=strict)
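# Illustrative usage sketch: a save/load round trip for CorrNet.
# The file name, sizes, and the absence of embedding / element table are placeholder assumptions.
def _demo_corrnet_roundtrip():
    net = CorrNet(input_dim=10, hidden_sizes=(32, 32))
    x = torch.randn(4, 3, 10, dtype=torch.float64)  # nframes x natom x nfeature
    e = net(x)                                      # per-frame correction energy, shape (4, 1)
    net.save("demo_model.pth", note="illustrative checkpoint")
    reloaded = CorrNet.load("demo_model.pth")
    assert torch.allclose(reloaded(x), e)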
import os
import sys
import numpy as np
import torch
def concat_batch(tdicts, dim=0):
keys = tdicts[0].keys()
assert all(d.keys() == keys for d in tdicts)
return {
k: torch.cat([d[k] for d in tdicts], dim)
for k in keys
}
def split_batch(tdict, size, dim=0):
dsplit = {k: torch.split(v, size, dim) for k,v in tdict.items()}
nsecs = [len(v) for v in dsplit.values()]
assert all(ns == nsecs[0] for ns in nsecs)
return [
{k: v[i] for k, v in dsplit.items()}
for i in range(nsecs[0])
]
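# Illustrative sketch of the two helpers above: concat_batch stacks per-system dicts along `dim`,
# and split_batch cuts them back into fixed-size chunks (the shapes here are placeholders).
def _demo_batch_helpers():
    a = {"lb_e": torch.zeros(2, 1), "eig": torch.zeros(2, 3, 5)}
    b = {"lb_e": torch.ones(4, 1), "eig": torch.ones(4, 3, 5)}
    merged = concat_batch([a, b], dim=0)         # {"lb_e": (6, 1), "eig": (6, 3, 5)}
    chunks = split_batch(merged, size=2, dim=0)  # three dicts with 2 frames each
    assert len(chunks) == 3 and chunks[0]["eig"].shape[0] == 2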
class Reader(object):
def __init__(self, data_path, batch_size,
e_name="l_e_delta", d_name="dm_eig",
f_name="l_f_delta", gvx_name="grad_vx",
eg_name="eg_base", gveg_name="grad_veg",
gldv_name="grad_ldv", conv_name="conv",
atom_name="atom", **kwargs):
self.data_path = data_path
self.batch_size = batch_size
self.e_path = self.check_exist(e_name+".npy")
self.f_path = self.check_exist(f_name+".npy")
self.d_path = self.check_exist(d_name+".npy")
self.gvx_path = self.check_exist(gvx_name+".npy")
self.eg_path = self.check_exist(eg_name+".npy")
self.gveg_path = self.check_exist(gveg_name+".npy")
self.gldv_path = self.check_exist(gldv_name+".npy")
self.c_path = self.check_exist(conv_name+".npy")
self.a_path = self.check_exist(atom_name+".npy")
# load data
self.load_meta()
self.prepare()
# initialize sample index queue
self.idx_queue = []
def check_exist(self, fname):
if fname is None:
return None
fpath = os.path.join(self.data_path, fname)
if os.path.exists(fpath):
return fpath
def load_meta(self):
        try:
            sys_meta = np.loadtxt(self.check_exist('system.raw'), dtype=int).reshape([-1])
            self.natm = sys_meta[0]
            self.nproj = sys_meta[-1]
        except Exception:
            print('#', self.data_path, "no system.raw, infer meta from data", file=sys.stderr)
            sys_shape = np.load(self.d_path).shape
            assert len(sys_shape) == 3, \
                "descriptor has to be an order-3 array with shape [nframes, natom, nproj]"
self.natm = sys_shape[1]
self.nproj = sys_shape[2]
self.ndesc = self.nproj
def prepare(self):
# load energy and check nframes
data_ec = np.load(self.e_path).reshape(-1, 1)
raw_nframes = data_ec.shape[0]
data_dm = np.load(self.d_path).reshape(raw_nframes, self.natm, self.ndesc)
if self.c_path is not None:
conv = np.load(self.c_path).reshape(raw_nframes)
else:
conv = np.ones(raw_nframes, dtype=bool)
self.data_ec = data_ec[conv]
self.data_dm = data_dm[conv]
self.nframes = conv.sum()
if self.nframes < self.batch_size:
self.batch_size = self.nframes
print('#', self.data_path,
f"reset batch size to {self.batch_size}", file=sys.stderr)
# handle atom and element data
self.atom_info = {}
if self.a_path is not None:
atoms = np.load(self.a_path).reshape(raw_nframes, self.natm, 4)
self.atom_info["elems"] = atoms[:, :, 0][conv].round().astype(int)
self.atom_info["coords"] = atoms[:, :, 1:][conv]
# load data in torch
self.t_data = {}
self.t_data["lb_e"] = torch.tensor(self.data_ec)
self.t_data["eig"] = torch.tensor(self.data_dm)
if self.f_path is not None and self.gvx_path is not None:
self.t_data["lb_f"] = torch.tensor(
np.load(self.f_path)\
.reshape(raw_nframes, -1, 3)[conv])
self.t_data["gvx"] = torch.tensor(
np.load(self.gvx_path)\
.reshape(raw_nframes, -1, 3, self.natm, self.ndesc)[conv])
if self.eg_path is not None and self.gveg_path is not None:
self.t_data['eg0'] = torch.tensor(
np.load(self.eg_path)\
.reshape(raw_nframes, -1)[conv])
self.t_data["gveg"] = torch.tensor(
np.load(self.gveg_path)\
.reshape(raw_nframes, self.natm, self.ndesc, -1)[conv])
self.neg = self.t_data['eg0'].shape[-1]
if self.gldv_path is not None:
self.t_data["gldv"] = torch.tensor(
np.load(self.gldv_path)\
.reshape(raw_nframes, self.natm, self.ndesc)[conv])
def sample_train(self):
if self.batch_size == self.nframes == 1:
return self.sample_all()
if len(self.idx_queue) < self.batch_size:
self.idx_queue = np.random.choice(self.nframes, self.nframes, replace=False)
sample_idx = self.idx_queue[:self.batch_size]
self.idx_queue = self.idx_queue[self.batch_size:]
return {k: v[sample_idx] for k, v in self.t_data.items()}
def sample_all(self):
return self.t_data
def get_train_size(self):
return self.nframes
def get_batch_size(self):
return self.batch_size
def get_nframes(self):
return self.nframes
def collect_elems(self, elem_list):
if "elem_list" in self.atom_info:
assert list(elem_list) == list(self.atom_info["elem_list"])
return self.atom_info["nelem"]
        # map atomic numbers to their index in elem_list; numbers not in the list map to 200 (out of range)
        elem_to_idx = np.zeros(200, dtype=int) + 200
for ii, ee in enumerate(elem_list):
elem_to_idx[ee] = ii
idxs = elem_to_idx[self.atom_info["elems"]]
nelem = np.zeros((self.nframes, len(elem_list)), int)
np.add.at(nelem, (np.arange(nelem.shape[0]).reshape(-1,1), idxs), 1)
self.atom_info["nelem"] = nelem
self.atom_info["elem_list"] = elem_list
return nelem
def subtract_elem_const(self, elem_const):
# assert "elem_const" not in self.atom_info, \
# "subtract_elem_const has been done. The method should not be executed twice."
econst = (self.atom_info["nelem"] @ elem_const).reshape(self.nframes, 1)
self.data_ec -= econst
self.t_data["lb_e"] -= econst
self.atom_info["elem_const"] = elem_const
    def revert_elem_const(self):
        if "elem_const" not in self.atom_info:
            return
elem_const = self.atom_info.pop("elem_const")
econst = (self.atom_info["nelem"] @ elem_const).reshape(self.nframes, 1)
self.data_ec += econst
self.t_data["lb_e"] += econst
class GroupReader(object):
    def __init__(self, path_list, batch_size=1, group_batch=1, extra_label=True, **kwargs):
if isinstance(path_list, str):
path_list = [path_list]
self.path_list = path_list
self.batch_size = batch_size
# init system readers
Reader_class = (Reader if extra_label
and isinstance(kwargs.get('d_name', "dm_eig"), str)
else SimpleReader)
self.readers = []
self.nframes = []
        for ipath in self.path_list:
ireader = Reader_class(ipath, batch_size, **kwargs)
if ireader.get_nframes() == 0:
print('# ignore empty dataset:', ipath, file=sys.stderr)
continue
self.readers.append(ireader)
self.nframes.append(ireader.get_nframes())
        if not self.readers:
            raise RuntimeError("No system is available")
self.nsystems = len(self.readers)
data_keys = self.readers[0].sample_all().keys()
print(f"# load {self.nsystems} systems with fields {set(data_keys)}")
# probability of each system
self.ndesc = self.readers[0].ndesc
        self.sys_prob = np.array(self.nframes, dtype=float) / np.sum(self.nframes)
self.group_batch = max(group_batch, 1)
if self.group_batch > 1:
self.group_dict = {}
# self.group_index = {}
for idx, r in enumerate(self.readers):
shape = (r.natm, getattr(r, "neg", None))
if shape in self.group_dict:
self.group_dict[shape].append(r)
# self.group_index[shape].append(idx)
else:
self.group_dict[shape] = [r]
# self.group_index[shape] = [idx]
self.group_prob = {n: sum(r.nframes for r in r_list) / sum(self.nframes)
for n, r_list in self.group_dict.items()}
self.batch_prob_raw = {n: [r.nframes / r.batch_size for r in r_list]
for n, r_list in self.group_dict.items()}
self.batch_prob = {n: p / np.sum(p) for n, p in self.batch_prob_raw.items()}
self._sample_used = 0
def __iter__(self):
return self
def __next__(self):
if self._sample_used > self.get_train_size():
self._sample_used = 0
raise StopIteration
sample = self.sample_train() if self.group_batch == 1 else self.sample_train_group()
self._sample_used += sample["lb_e"].shape[0]
return sample
    def sample_idx(self):
        return np.random.choice(np.arange(self.nsystems), p=self.sys_prob)
    def sample_train(self, idx=None):
        if idx is None:
            idx = self.sample_idx()
        return self.readers[idx].sample_train()
def sample_train_group(self):
cidx = np.random.choice(len(self.group_prob), p=list(self.group_prob.values()))
cshape = list(self.group_prob.keys())[cidx]
cgrp = self.group_dict[cshape]
csys = np.random.choice(cgrp, self.group_batch, p=self.batch_prob[cshape])
batch = concat_batch([s.sample_train() for s in csys], dim=0)
return batch
    def sample_all(self, idx=None):
        if idx is None:
            idx = self.sample_idx()
        return self.readers[idx].sample_all()
def sample_all_batch(self, idx=None):
if idx is not None:
all_data = self.sample_all(idx)
size = self.batch_size * self.group_batch
yield from split_batch(all_data, size, dim=0)
else:
for i in range(self.nsystems):
yield from self.sample_all_batch(i)
    def get_train_size(self):
        return np.sum(self.nframes)
    def get_batch_size(self):
        return self.batch_size
def compute_data_stat(self, symm_sections=None):
all_dm = np.concatenate([r.data_dm.reshape(-1,r.ndesc) for r in self.readers])
if symm_sections is None:
all_mean, all_std = all_dm.mean(0), all_dm.std(0)
else:
assert sum(symm_sections) == all_dm.shape[-1]
dm_shells = np.split(all_dm, np.cumsum(symm_sections)[:-1], axis=-1)
mean_shells = [d.mean().repeat(s) for d, s in zip(dm_shells, symm_sections)]
std_shells = [d.std().repeat(s) for d, s in zip(dm_shells, symm_sections)]
all_mean = np.concatenate(mean_shells, axis=-1)
all_std = np.concatenate(std_shells, axis=-1)
return all_mean, all_std
def compute_prefitting(self, shift=None, scale=None, ridge_alpha=1e-8, symm_sections=None):
if shift is None or scale is None:
all_mean, all_std = self.compute_data_stat(symm_sections=symm_sections)
if shift is None:
shift = all_mean
if scale is None:
scale = all_std
all_sdm = np.concatenate([((r.data_dm - shift) / scale).sum(1) for r in self.readers])
all_natm = np.concatenate([[float(r.data_dm.shape[1])]*r.data_dm.shape[0] for r in self.readers])
if symm_sections is not None: # in this case ridge alpha cannot be 0
assert sum(symm_sections) == all_sdm.shape[-1]
sdm_shells = np.split(all_sdm, np.cumsum(symm_sections)[:-1], axis=-1)
all_sdm = np.stack([d.sum(-1) for d in sdm_shells], axis=-1)
# build feature matrix
X = np.concatenate([all_sdm, all_natm.reshape(-1,1)], -1)
y = np.concatenate([r.data_ec for r in self.readers])
I = np.identity(X.shape[1])
        I[-1, -1] = 0  # do not penalize the bias term
# solve ridge reg
coef = np.linalg.solve(X.T @ X + ridge_alpha * I, X.T @ y).reshape(-1)
weight, bias = coef[:-1], coef[-1]
if symm_sections is not None:
weight = np.concatenate([w.repeat(s) for w, s in zip(weight, symm_sections)], axis=-1)
return weight, bias
def collect_elems(self, elem_list=None):
if elem_list is None:
elem_list = np.array(sorted(set.union(*[
set(r.atom_info["elems"].flatten()) for r in self.readers
])))
for rd in self.readers:
rd.collect_elems(elem_list)
return elem_list
def compute_elem_const(self, ridge_alpha=0.):
elem_list = self.collect_elems()
all_nelem = np.concatenate([r.atom_info["nelem"] for r in self.readers])
all_ec = np.concatenate([r.data_ec for r in self.readers])
# lex sort by nelem
lexidx = np.lexsort(all_nelem.T)
all_nelem = all_nelem[lexidx]
all_ec = all_ec[lexidx]
# group by nelem
_, sidx = np.unique(all_nelem, return_index=True, axis=0)
sidx = np.sort(sidx)
grp_nelem = all_nelem[sidx]
grp_ec = np.array(list(map(np.mean, np.split(all_ec, sidx[1:]))))
if ridge_alpha <= 0:
elem_const, _res, _rank, _sing = np.linalg.lstsq(grp_nelem, grp_ec, None)
else:
I = np.identity(grp_nelem.shape[1])
elem_const = np.linalg.solve(
grp_nelem.T @ grp_nelem + ridge_alpha * I, grp_nelem.T @ grp_ec)
return elem_list.reshape(-1), elem_const.reshape(-1)
def subtract_elem_const(self, elem_const):
for rd in self.readers:
rd.subtract_elem_const(elem_const)
def revert_elem_const(self):
for rd in self.readers:
rd.revert_elem_const()
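# Illustrative sketch: iterating over several systems with GroupReader.
# The directory names are placeholders; each must hold the .npy files expected by Reader.
def _demo_group_reader(paths=("system.000", "system.001")):
    g_reader = GroupReader(list(paths), batch_size=16, group_batch=1)
    for sample in g_reader:                    # one pass covering roughly get_train_size() frames
        e_label, desc = sample["lb_e"], sample["eig"]
    mean, std = g_reader.compute_data_stat()   # per-descriptor statistics, used by preprocessing
    return mean.shape, std.shape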
class SimpleReader(object):
def __init__(self, data_path, batch_size,
e_name="l_e_delta", d_name="dm_eig",
conv_filter=True, conv_name="conv", **kwargs):
# copy from config
self.data_path = data_path
self.batch_size = batch_size
self.e_name = e_name
self.d_name = d_name if isinstance(d_name, (list, tuple)) else [d_name]
self.c_filter = conv_filter
self.c_name = conv_name
self.load_meta()
self.prepare()
def load_meta(self):
try:
            sys_meta = np.loadtxt(os.path.join(self.data_path, 'system.raw'), dtype=int).reshape([-1])
            self.natm = sys_meta[0]
            self.nproj = sys_meta[-1]
        except Exception:
            print('#', self.data_path, "no system.raw, infer meta from data", file=sys.stderr)
            sys_shape = np.load(os.path.join(self.data_path, f'{self.d_name[0]}.npy')).shape
            assert len(sys_shape) == 3, \
                f"{self.d_name[0]} has to be an order-3 array with shape [nframes, natom, nproj]"
self.natm = sys_shape[1]
self.nproj = sys_shape[2]
def prepare(self):
self.index_count_all = 0
data_ec = np.load(os.path.join(self.data_path,f'{self.e_name}.npy')).reshape([-1, 1])
raw_nframes = data_ec.shape[0]
data_dm = np.concatenate(
[np.load(os.path.join(self.data_path,f'{dn}.npy'))\
.reshape([raw_nframes, self.natm, -1])
for dn in self.d_name],
axis=-1)
if self.c_filter:
conv = np.load(os.path.join(self.data_path,f'{self.c_name}.npy')).reshape(raw_nframes)
else:
conv = np.ones(raw_nframes, dtype=bool)
self.data_ec = data_ec[conv]
self.data_dm = data_dm[conv]
self.nframes = conv.sum()
self.ndesc = self.data_dm.shape[-1]
# print(np.shape(self.inputs_train))
if self.nframes < self.batch_size:
self.batch_size = self.nframes
print('#', self.data_path, f"reset batch size to {self.batch_size}", file=sys.stderr)
def sample_train(self):
if self.nframes == self.batch_size == 1:
return self.sample_all()
self.index_count_all += self.batch_size
if self.index_count_all > self.nframes:
# shuffle the data
self.index_count_all = self.batch_size
ind = np.random.choice(self.nframes, self.nframes, replace=False)
self.data_ec = self.data_ec[ind]
self.data_dm = self.data_dm[ind]
ind = np.arange(self.index_count_all - self.batch_size, self.index_count_all)
return {
"lb_e": torch.from_numpy(self.data_ec[ind]),
"eig": torch.from_numpy(self.data_dm[ind])
}
    def sample_all(self):
        return {
            "lb_e": torch.from_numpy(self.data_ec),
            "eig": torch.from_numpy(self.data_dm)
        }
    def get_train_size(self):
        return self.nframes
    def get_batch_size(self):
        return self.batch_size
    def get_nframes(self):
        return self.nframes
import os
import numpy as np
import torch
import torch.nn as nn
try:
import deepks
except ImportError:
import sys
sys.path.append(os.path.dirname(os.path.realpath(__file__)) + "/../../")
from deepks.model.model import CorrNet
from deepks.model.reader import GroupReader
from deepks.utils import load_yaml, load_dirs, check_list
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
def test(model, g_reader, dump_prefix="test", group=False):
model.eval()
    loss_fn = nn.MSELoss()
label_list = []
pred_list = []
for i in range(g_reader.nsystems):
sample = g_reader.sample_all(i)
nframes = sample["lb_e"].shape[0]
sample = {k: v.to(DEVICE, non_blocking=True) for k, v in sample.items()}
label, data = sample["lb_e"], sample["eig"]
pred = model(data)
error = torch.sqrt(loss_fn(pred, label))
error_np = error.item()
label_np = label.cpu().numpy().reshape(nframes, -1).sum(axis=1)
pred_np = pred.detach().cpu().numpy().reshape(nframes, -1).sum(axis=1)
error_l1 = np.mean(np.abs(label_np - pred_np))
label_list.append(label_np)
pred_list.append(pred_np)
if not group and dump_prefix is not None:
nd = max(len(str(g_reader.nsystems)), 2)
dump_res = np.stack([label_np, pred_np], axis=1)
header = f"{g_reader.path_list[i]}\nmean l1 error: {error_l1}\nmean l2 error: {error_np}\nreal_ene pred_ene"
filename = f"{dump_prefix}.{i:0{nd}}.out"
np.savetxt(filename, dump_res, header=header)
# print(f"system {i} finished")
all_label = np.concatenate(label_list, axis=0)
all_pred = np.concatenate(pred_list, axis=0)
all_err_l1 = np.mean(np.abs(all_label - all_pred))
all_err_l2 = np.sqrt(np.mean((all_label - all_pred) ** 2))
info = f"all systems mean l1 error: {all_err_l1}\nall systems mean l2 error: {all_err_l2}"
print(info)
if dump_prefix is not None and group:
np.savetxt(f"{dump_prefix}.out", np.stack([all_label, all_pred], axis=1),
header=info + "\nreal_ene pred_ene")
return all_err_l1, all_err_l2
def main(data_paths, model_file="model.pth",
output_prefix='test', group=False,
e_name='l_e_delta', d_name=['dm_eig']):
data_paths = load_dirs(data_paths)
if len(d_name) == 1:
d_name = d_name[0]
g_reader = GroupReader(data_paths, e_name=e_name, d_name=d_name,
conv_filter=False, extra_label=True)
model_file = check_list(model_file)
for f in model_file:
print(f)
p = os.path.dirname(f)
model = CorrNet.load(f).double().to(DEVICE)
dump = os.path.join(p, output_prefix)
dir_name = os.path.dirname(dump)
if dir_name:
os.makedirs(dir_name, exist_ok=True)
if model.elem_table is not None:
elist, econst = model.elem_table
g_reader.collect_elems(elist)
g_reader.subtract_elem_const(econst)
test(model, g_reader, dump_prefix=dump, group=group)
g_reader.revert_elem_const()
if __name__ == "__main__":
from deepks.main import test_cli as cli
cli()
import os
import sys
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from time import time
try:
import deepks
except ImportError:
sys.path.append(os.path.dirname(os.path.realpath(__file__)) + "/../../")
from deepks.model.model import CorrNet
from deepks.model.reader import GroupReader
from deepks.utils import load_dirs, load_elem_table
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
def fit_elem_const(g_reader, test_reader=None, elem_table=None, ridge_alpha=0.):
if elem_table is None:
elem_table = g_reader.compute_elem_const(ridge_alpha)
elem_list, elem_const = elem_table
g_reader.collect_elems(elem_list)
g_reader.subtract_elem_const(elem_const)
if test_reader is not None:
test_reader.collect_elems(elem_list)
test_reader.subtract_elem_const(elem_const)
return elem_table
def preprocess(model, g_reader,
preshift=True, prescale=False, prescale_sqrt=False, prescale_clip=0,
prefit=True, prefit_ridge=10, prefit_trainable=False):
shift = model.input_shift.cpu().detach().numpy()
scale = model.input_scale.cpu().detach().numpy()
symm_sec = model.shell_sec # will be None if no embedding
prefit_trainable = prefit_trainable and symm_sec is None # no embedding
if preshift or prescale:
davg, dstd = g_reader.compute_data_stat(symm_sec)
if preshift:
shift = davg
if prescale:
scale = dstd
if prescale_sqrt:
scale = np.sqrt(scale)
if prescale_clip:
scale = scale.clip(prescale_clip)
model.set_normalization(shift, scale)
if prefit:
weight, bias = g_reader.compute_prefitting(
shift=shift, scale=scale,
ridge_alpha=prefit_ridge, symm_sections=symm_sec)
model.set_prefitting(weight, bias, trainable=prefit_trainable)
def make_loss(cap=None, shrink=None, reduction="mean"):
def loss_fn(input, target):
diff = target - input
if shrink and shrink > 0:
diff = F.softshrink(diff, shrink)
sqdf = diff ** 2
if cap and cap > 0:
abdf = diff.abs()
sqdf = torch.where(abdf < cap, sqdf, cap * (2*abdf - cap))
if reduction is None or reduction.lower() == "none":
return sqdf
elif reduction.lower() == "mean":
return sqdf.mean()
elif reduction.lower() == "sum":
return sqdf.sum()
elif reduction.lower() in ("batch", "bmean"):
return sqdf.sum() / sqdf.shape[0]
else:
raise ValueError(f"{reduction} is not a valid reduction type")
return loss_fn
# equiv to nn.MSELoss()
L2LOSS = make_loss(cap=None, shrink=None, reduction="mean")
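# Illustrative sketch of how the loss factory above behaves (the values are placeholders):
# softshrink first discards residuals smaller than `shrink`, and residuals beyond `cap`
# are counted linearly instead of quadratically (Huber-like), which damps outliers.
def _demo_make_loss():
    pred = torch.tensor([0.0, 0.0, 0.0], dtype=torch.float64)
    target = torch.tensor([0.005, 0.5, 5.0], dtype=torch.float64)
    plain = make_loss()(pred, target)                        # same value as nn.MSELoss()
    robust = make_loss(cap=1.0, shrink=0.01)(pred, target)   # the 5.0 outlier enters linearly
    return plain.item(), robust.item()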
class Evaluator:
def __init__(self,
energy_factor=1., force_factor=0.,
density_factor=0., grad_penalty=0.,
energy_lossfn=None, force_lossfn=None):
# energy term
if energy_lossfn is None:
energy_lossfn = {}
if isinstance(energy_lossfn, dict):
energy_lossfn = make_loss(**energy_lossfn)
self.e_factor = energy_factor
self.e_lossfn = energy_lossfn
# force term
if force_lossfn is None:
force_lossfn = {}
if isinstance(force_lossfn, dict):
force_lossfn = make_loss(**force_lossfn)
self.f_factor = force_factor
self.f_lossfn = force_lossfn
# coulomb term of dm; requires head gradient
self.d_factor = density_factor
# gradient penalty, not very useful
self.g_penalty = grad_penalty
def __call__(self, model, sample):
_dref = next(model.parameters())
tot_loss = 0.
sample = {k: v.to(_dref, non_blocking=True) for k, v in sample.items()}
e_label, eig = sample["lb_e"], sample["eig"]
nframe = e_label.shape[0]
requires_grad = ( (self.f_factor > 0 and "lb_f" in sample)
or (self.d_factor > 0 and "gldv" in sample)
or self.g_penalty > 0)
eig.requires_grad_(requires_grad)
# begin the calculation
e_pred = model(eig)
tot_loss = tot_loss + self.e_factor * self.e_lossfn(e_pred, e_label)
if requires_grad:
[gev] = torch.autograd.grad(e_pred, eig,
grad_outputs=torch.ones_like(e_pred),
retain_graph=True, create_graph=True, only_inputs=True)
# for now always use pure l2 loss for gradient penalty
if self.g_penalty > 0 and "eg0" in sample:
eg_base, gveg = sample["eg0"], sample["gveg"]
eg_tot = torch.einsum('...apg,...ap->...g', gveg, gev) + eg_base
tot_loss = tot_loss + self.g_penalty * eg_tot.pow(2).mean(0).sum()
# optional force calculation
if self.f_factor > 0 and "lb_f" in sample:
f_label, gvx = sample["lb_f"], sample["gvx"]
f_pred = - torch.einsum("...bxap,...ap->...bx", gvx, gev)
tot_loss = tot_loss + self.f_factor * self.f_lossfn(f_pred, f_label)
# density loss with fix head grad
if self.d_factor > 0 and "gldv" in sample:
gldv = sample["gldv"]
tot_loss = tot_loss + self.d_factor * (gldv * gev).mean(0).sum()
return tot_loss
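# Illustrative sketch: evaluating the combined loss on one sampled batch.
# `model` and `g_reader` are assumed to be a CorrNet and a GroupReader, as built in main() below;
# the force term only contributes when the batch provides "lb_f" and "gvx".
def _demo_evaluator(model, g_reader):
    evaluator = Evaluator(energy_factor=1., force_factor=1.)
    sample = g_reader.sample_train()
    loss = evaluator(model, sample)
    return loss.item()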
def train(model, g_reader, n_epoch=1000, test_reader=None, *,
energy_factor=1., force_factor=0., density_factor=0.,
energy_loss=None, force_loss=None, grad_penalty=0.,
start_lr=0.001, decay_steps=100, decay_rate=0.96, stop_lr=None,
weight_decay=0., fix_embedding=False,
display_epoch=100, ckpt_file="model.pth",
graph_file=None, device=DEVICE):
model = model.to(device)
model.eval()
print("# working on device:", device)
if test_reader is None:
test_reader = g_reader
# fix parameters if needed
if fix_embedding and model.embedder is not None:
model.embedder.requires_grad_(False)
# set up optimizer and lr scheduler
optimizer = optim.Adam(model.parameters(), lr=start_lr, weight_decay=weight_decay)
if stop_lr is not None:
decay_rate = (stop_lr / start_lr) ** (1 / (n_epoch // decay_steps))
print(f"# resetting decay_rate: {decay_rate:.4f} "
+ f"to satisfy stop_lr: {stop_lr:.2e}")
scheduler = optim.lr_scheduler.StepLR(optimizer, decay_steps, decay_rate)
# make evaluators for training
evaluator = Evaluator(energy_factor=energy_factor, force_factor=force_factor,
energy_lossfn=energy_loss, force_lossfn=force_loss,
density_factor=density_factor, grad_penalty=grad_penalty)
# make test evaluator that only returns l2loss of energy
test_eval = Evaluator(energy_factor=1., energy_lossfn=L2LOSS,
force_factor=0., density_factor=0., grad_penalty=0.)
print("# epoch trn_err tst_err lr trn_time tst_time ")
tic = time()
trn_loss = np.mean([evaluator(model, batch).item()
for batch in g_reader.sample_all_batch()])
tst_loss = np.mean([test_eval(model, batch).item()
for batch in test_reader.sample_all_batch()])
tst_time = time() - tic
print(f" {0:<8d} {np.sqrt(np.abs(trn_loss)):>.2e} {np.sqrt(np.abs(tst_loss)):>.2e}"
f" {start_lr:>.2e} {0:>8.2f} {tst_time:>8.2f}")
for epoch in range(1, n_epoch+1):
tic = time()
loss_list = []
for sample in g_reader:
model.train()
optimizer.zero_grad()
loss = evaluator(model, sample)
loss.backward()
optimizer.step()
loss_list.append(loss.item())
scheduler.step()
if epoch % display_epoch == 0:
model.eval()
trn_loss = np.mean(loss_list)
trn_time = time() - tic
tic = time()
tst_loss = np.mean([test_eval(model, batch).item()
for batch in test_reader.sample_all_batch()])
tst_time = time() - tic
print(f" {epoch:<8d} {np.sqrt(np.abs(trn_loss)):>.2e} {np.sqrt(np.abs(tst_loss)):>.2e}"
f" {scheduler.get_last_lr()[0]:>.2e} {trn_time:>8.2f} {tst_time:8.2f}")
if ckpt_file:
model.save(ckpt_file)
if ckpt_file:
model.save(ckpt_file)
if graph_file:
model.compile_save(graph_file)
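# Illustrative sketch: the bare training workflow without the CLI wrapper in main() below.
# The data directories, sizes, and epoch counts are placeholders.
def _demo_train(train_paths=("system.000", "system.001")):
    g_reader = GroupReader(list(train_paths), batch_size=16)
    model = CorrNet(input_dim=g_reader.ndesc, hidden_sizes=(100, 100, 100)).double()
    preprocess(model, g_reader, preshift=True, prefit=True)
    train(model, g_reader, n_epoch=100, display_epoch=10, ckpt_file="demo_model.pth")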
def main(train_paths, test_paths=None,
restart=None, ckpt_file=None,
model_args=None, data_args=None,
preprocess_args=None, train_args=None,
proj_basis=None, fit_elem=False,
seed=None, device=None):
if seed is None:
seed = np.random.randint(0, 2**32)
print(f'# using seed: {seed}')
np.random.seed(seed)
torch.manual_seed(seed)
if model_args is None: model_args = {}
if data_args is None: data_args = {}
if preprocess_args is None: preprocess_args = {}
if train_args is None: train_args = {}
if proj_basis is not None:
model_args["proj_basis"] = proj_basis
if ckpt_file is not None:
train_args["ckpt_file"] = ckpt_file
if device is not None:
train_args["device"] = device
train_paths = load_dirs(train_paths)
# print(f'# training with {len(train_paths)} system(s)')
g_reader = GroupReader(train_paths, **data_args)
if test_paths is not None:
test_paths = load_dirs(test_paths)
# print(f'# testing with {len(test_paths)} system(s)')
test_reader = GroupReader(test_paths, **data_args)
else:
print('# testing with training set')
test_reader = None
if restart is not None:
model = CorrNet.load(restart)
if model.elem_table is not None:
fit_elem_const(g_reader, test_reader, model.elem_table)
else:
input_dim = g_reader.ndesc
        if model_args.get("input_dim", input_dim) != input_dim:
            print("# `input_dim` in `model_args` does not match the data",
                  f"({input_dim}).", "Using the value from the data.", file=sys.stderr)
model_args["input_dim"] = input_dim
if fit_elem:
elem_table = model_args.get("elem_table", None)
if isinstance(elem_table, str):
elem_table = load_elem_table(elem_table)
elem_table = fit_elem_const(g_reader, test_reader, elem_table)
model_args["elem_table"] = elem_table
model = CorrNet(**model_args).double()
preprocess(model, g_reader, **preprocess_args)
train(model, g_reader, test_reader=test_reader, **train_args)
if __name__ == "__main__":
from deepks.main import train_cli as cli
cli()
__all__ = [
"scf",
"grad",
"run",
"stats",
"fields",
"penalty",
]
def __getattr__(name):
from importlib import import_module
if name in __all__:
return import_module("." + name, __name__)
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
def DSCF(mol, model, xc="HF", **kwargs):
"""A wrap function to create NN SCF object (RDSCF or UDSCF)"""
from .scf import RDSCF, UDSCF
if mol.spin == 0:
return RDSCF(mol, model, xc, **kwargs)
else:
return UDSCF(mol, model, xc, **kwargs)
DeepSCF = DSCF
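# Illustrative sketch: building an NN SCF object for a small closed-shell molecule.
# The geometry, basis, and the trained checkpoint "model.pth" are placeholder assumptions.
def _demo_dscf():
    from pyscf import gto
    from deepks.model.model import CorrNet
    mol = gto.M(atom="H 0 0 0; F 0 0 0.9", basis="ccpvdz")
    model = CorrNet.load("model.pth")    # trained correction network
    mf = DSCF(mol, model, xc="HF")       # dispatches to RDSCF since mol.spin == 0
    e_tot = mf.kernel()
    return e_tot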
import os
import sys
try:
import deepks
except ImportError:
sys.path.append(os.path.dirname(os.path.realpath(__file__)) + "/../")
from deepks.main import scf_cli
if __name__ == "__main__":
scf_cli()
import torch
import numpy as np
from pyscf import gto, lib
from pyscf.grad import rks as grad_base
class Gradients(grad_base.Gradients):
    """Analytical nuclear gradient for our SCF model"""
    # all variables and functions starting with "t_" are torch related.
    # convention in einsum:
    #   i,j: orbital
    #   a,b: atom
    #   p,q: projected basis on atom
    #   r,s: mol basis in pyscf
    #   x  : space component of gradient
    #   v  : eigen values of projected dm
def __init__(self, mf):
super().__init__(mf)
# prepare integrals for projection and derivative
self.prepare_integrals()
# add a field to memorize the pulay term in ec
self.dec = None
self._keys.update(self.__dict__.keys())
def prepare_integrals(self):
mf = self.base
self._pmol = mf._pmol
# < mol_ao | alpha^I_rlm > by shells
self._t_ovlp_shells = mf._t_ovlp_shells
# \partial E / \partial (D^I_rl)_mm' by shells
self._t_gedm_shells = _t_get_grad_dms(mf) if mf.mo_coeff is not None else None
# < \nabla mol_ao | alpha^I_rlm >
self._t_proj_ipovlp = torch.from_numpy(
mf.proj_intor("int1e_ipovlp")).double().to(mf.device)
def extra_force(self, atom_id, envs):
"""We calculate the pulay force caused by our atomic projection here"""
de0 = super().extra_force(atom_id, envs)
dm = envs["dm0"]
t_dm = torch.from_numpy(dm).double().to(self.base.device)
t_dec = self._t_get_pulay(atom_id, t_dm)
dec = t_dec.detach().cpu().numpy()
        # memorize the dec results for calculating the HF gradient later
if self.dec is None:
self.dec = np.zeros((len(envs["atmlst"]), 3))
self.dec[envs["k"]] = dec
# return summed grads
return de0 + dec
def kernel(self, *args, **kwargs):
        # does nothing beyond the original kernel except symmetrizing dec;
        # returns exactly the same result
de = super().kernel(*args, **kwargs)
if self.mol.symmetry:
self.dec = self.symmetrize(self.dec, self.atmlst)
return de
def get_base(self):
"""return the grad given by raw Hartree Fock Hamiltonian under current dm"""
assert self.de is not None and self.dec is not None
return self.de - self.dec
def _t_get_pulay(self, atom_id, t_dm):
"""calculate pulay force in torch tensor"""
if self._t_gedm_shells is None:
self._t_gedm_shells = _t_get_grad_dms(self.base)
        # mask to select the specific atom's contribution from ipovlp
        mask = self._t_make_mask(atom_id)
        # \partial < mol_ao | alpha^I_rlm' > / \partial X^J
atom_ipovlp = (self._t_proj_ipovlp * mask).reshape(3, self.mol.nao, self.mol.natm, -1)
# grad X^I w.r.t atomic overlap coeff by shells
govx_shells = torch.split(atom_ipovlp, self.base._shell_sec, -1)
        # \partial (D^I_rl)_mm' / \partial X^J by shells, not yet symmetrized
gdmx_shells = [torch.einsum('xrap,rs,saq->xapq', govx, t_dm, po)
for govx, po in zip(govx_shells, self._t_ovlp_shells)]
# \partial E / \partial X^J by shells
gex_shells = [torch.einsum("xapq,apq->x", gdmx + gdmx.transpose(-1,-2), gedm)
for gdmx, gedm in zip(gdmx_shells, self._t_gedm_shells)]
# total pulay term in gradient
return torch.stack(gex_shells, 0).sum(0)
def _t_make_mask(self, atom_id):
mask = torch.from_numpy(
make_mask(self.mol, self._pmol, atom_id)
).double().to(self.base.device)
return mask
def make_grad_pdm_x(self, dm=None, flatten=False):
if dm is None:
dm = self.base.make_rdm1()
t_dm = torch.from_numpy(dm).double().to(self.base.device)
all_gdmx_shells = self._t_make_grad_pdm_x(t_dm)
if not flatten:
return [s.detach().cpu().numpy() for s in all_gdmx_shells]
else:
return torch.cat([s.flatten(-2) for s in all_gdmx_shells],
dim=-1).detach().cpu().numpy()
def _t_make_grad_pdm_x(self, t_dm):
atom_gdmx_shells = []
for atom_id in range(self.mol.natm):
mask = self._t_make_mask(atom_id)
atom_ipovlp = (self._t_proj_ipovlp * mask).reshape(3, self.mol.nao, self.mol.natm, -1)
govx_shells = torch.split(atom_ipovlp, self.base._shell_sec, -1)
gdmx_shells = [torch.einsum('xrap,rs,saq->xapq', govx, t_dm, po)
for govx, po in zip(govx_shells, self._t_ovlp_shells)]
atom_gdmx_shells.append([gdmx + gdmx.transpose(-1,-2) for gdmx in gdmx_shells])
# [natom (deriv atom) x 3 (xyz) x natom (proj atom) x nsph (1|3|5) x nsph] list
all_gdmx_shells = [torch.stack(s, dim=0) for s in zip(*atom_gdmx_shells)]
return all_gdmx_shells
def make_grad_eig_x(self, dm=None):
if dm is None:
dm = self.base.make_rdm1()
t_dm = torch.from_numpy(dm).double().to(self.base.device)
return self._t_make_grad_eig_x(t_dm).detach().cpu().numpy()
def _t_make_grad_eig_x(self, t_dm):
# v stands for eigen values
shell_pdm = [torch.einsum('rap,rs,saq->apq', po, t_dm, po).requires_grad_(True)
for po in self._t_ovlp_shells]
calc_eig = lambda dm: torch.symeig(dm, True)[0]
shell_gvdm = [get_batch_jacobian(calc_eig, dm, dm.shape[-1])
for dm in shell_pdm]
shell_gdmx = self._t_make_grad_pdm_x(t_dm)
shell_gvx = [torch.einsum("bxapq,avpq->bxav", gdmx, gvdm)
for gdmx, gvdm in zip(shell_gdmx, shell_gvdm)]
return torch.cat(shell_gvx, dim=-1)
def as_scanner(self):
scanner = super().as_scanner()
# make a new version of call method
class NewScanner(type(scanner)):
def __call__(self, mol_or_geom, **kwargs):
if isinstance(mol_or_geom, gto.Mole):
mol = mol_or_geom
else:
mol = self.mol.set_geom_(mol_or_geom, inplace=False)
mf_scanner = self.base
e_tot = mf_scanner(mol)
self.mol = mol
if getattr(self, 'grids', None):
self.grids.reset(mol)
# adding the following line to refresh integrals
self.prepare_integrals()
de = self.kernel(**kwargs)
return e_tot, de
        # patch the old scanner's class so it binds the new __call__ method
scanner.__class__ = NewScanner
return scanner
def make_mask(mol1, mol2, atom_id):
mask = np.zeros((mol1.nao, mol2.nao))
bg1, ed1 = mol1.aoslice_by_atom()[atom_id, 2:]
bg2, ed2 = mol2.aoslice_by_atom()[atom_id, 2:]
mask[bg1:ed1, :] -= 1
mask[:, bg2:ed2] += 1
return mask
def _t_get_grad_dms(mf, dm=None):
# calculate \partial E / \partial (D^I_rl)_mm' by shells
if dm is None:
dm = mf.make_rdm1()
t_dm = torch.from_numpy(dm).double().to(mf.device)
proj_dms = [torch.einsum('rap,rs,saq->apq', po, t_dm, po).requires_grad_(True)
for po in mf._t_ovlp_shells]
if mf.net is None:
return [torch.zeros_like(pdm) for pdm in proj_dms]
proj_eigs = [torch.symeig(dm, eigenvectors=True)[0]
for dm in proj_dms]
ceig = torch.cat(proj_eigs, dim=-1).unsqueeze(0) # 1 x natoms x nproj
ec = mf.net(ceig)
grad_dms = torch.autograd.grad(ec, proj_dms)
return grad_dms
def get_batch_jacobian(f, x, noutputs):
nindim = len(x.shape)-1
x = x.unsqueeze(1) # b, 1 ,*in_dim
n = x.shape[0]
x = x.repeat(1, noutputs, *[1]*nindim) # b, out_dim, *in_dim
x.requires_grad_(True)
y = f(x)
    input_val = torch.eye(noutputs, dtype=x.dtype, device=x.device).reshape(1, noutputs, noutputs).repeat(n, 1, 1)
return torch.autograd.grad(y, x, input_val)[0]
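# Illustrative sketch: batched Jacobian of the eigenvalue map, as used in _t_make_grad_eig_x above.
# The batch size and matrix dimension are placeholders.
def _demo_batch_jacobian():
    dm = torch.randn(4, 3, 3, dtype=torch.float64)
    dm = dm + dm.transpose(-1, -2)                 # symmetrize each matrix
    calc_eig = lambda m: torch.symeig(m, True)[0]  # same eigenvalue map as above
    jac = get_batch_jacobian(calc_eig, dm, 3)      # shape (4, 3, 3, 3): batch x eigenvalue x dm indices
    return jac.shape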
# only for testing purposes; not used elsewhere in the code
def finite_difference(f, x, delta=1e-6):
in_shape = x.shape
y0 = f(x)
out_shape = y0.shape
res = np.empty(in_shape + out_shape)
for idx in np.ndindex(*in_shape):
diff = np.zeros(in_shape)
diff[idx] += delta
y1 = f(x+diff)
res[idx] = (y1-y0) / delta
return res
Grad = Gradients
# from deepks.scf.scf import DSCF
# # Inject to SCF class
# DSCF.Gradients = lib.class_as_method(Gradients)