Commit 6b33aeb8 authored by zhangqha

BladeDISC DeePMD code

#!/usr/bin/env python3
from deepmd.descriptor.descriptor import Descriptor
import logging
import os
import glob
import platform
import time
import shutil
import google.protobuf.message
import numpy as np
from packaging.version import Version
from deepmd.env import tf, tfv2
from deepmd.env import get_tf_session_config
from deepmd.env import GLOBAL_TF_FLOAT_PRECISION
from deepmd.env import GLOBAL_ENER_FLOAT_PRECISION
from deepmd.fit import EnerFitting, WFCFitting, PolarFittingLocFrame, PolarFittingSeA, GlobalPolarFittingSeA, DipoleFittingSeA
from deepmd.descriptor import Descriptor
from deepmd.model import EnerModel, WFCModel, DipoleModel, PolarModel, GlobalPolarModel
from deepmd.loss import EnerStdLoss, EnerDipoleLoss, TensorLoss
from deepmd.utils.errors import GraphTooLargeError
from deepmd.utils.learning_rate import LearningRateExp
from deepmd.utils.neighbor_stat import NeighborStat
from deepmd.utils.sess import run_sess
from deepmd.utils.type_embed import TypeEmbedNet
from deepmd.utils.graph import load_graph_def, get_tensor_by_name_from_graph
from deepmd.utils.argcheck import type_embedding_args
from tensorflow.python.client import timeline
from deepmd.env import op_module, TF_VERSION
from deepmd.utils.errors import GraphWithoutTensorError
# load grad of force module
import deepmd.op
from deepmd.common import j_must_have, ClassArg, data_requirement, get_precision
log = logging.getLogger(__name__)
# nvnmd
from deepmd.nvnmd.utils.config import nvnmd_cfg
def _is_subdir(path, directory):
path = os.path.realpath(path)
directory = os.path.realpath(directory)
if path == directory:
return False
relative = os.path.relpath(path, directory) + os.sep
return not relative.startswith(os.pardir + os.sep)
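# A quick illustration of _is_subdir (hypothetical paths):
#   _is_subdir("/work/run/ckpt", "/work/run")  -> True
#   _is_subdir("/work/run", "/work/run")       -> False  (a directory is not its own subdirectory)
#   _is_subdir("/work/other", "/work/run")     -> False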
class DPTrainer (object):
def __init__(self,
jdata,
run_opt,
is_compress = False):
self.run_opt = run_opt
self._init_param(jdata)
self.is_compress = is_compress
def _init_param(self, jdata):
# model config
model_param = j_must_have(jdata, 'model')
descrpt_param = j_must_have(model_param, 'descriptor')
fitting_param = j_must_have(model_param, 'fitting_net')
typeebd_param = model_param.get('type_embedding', None)
self.model_param = model_param
self.descrpt_param = descrpt_param
# nvnmd
self.nvnmd_param = jdata.get('nvnmd', {})
nvnmd_cfg.init_from_jdata(self.nvnmd_param)
if nvnmd_cfg.enable:
nvnmd_cfg.init_from_deepmd_input(model_param)
nvnmd_cfg.disp_message()
nvnmd_cfg.save()
# descriptor
try:
descrpt_type = descrpt_param['type']
self.descrpt_type = descrpt_type
except KeyError:
raise KeyError('the type of descriptor should be set by `type`')
if descrpt_param['type'] in ['se_atten']:
descrpt_param['ntypes'] = len(model_param['type_map'])
self.descrpt = Descriptor(**descrpt_param)
# fitting net
fitting_type = fitting_param.get('type', 'ener')
self.fitting_type = fitting_type
fitting_param.pop('type', None)
fitting_param['descrpt'] = self.descrpt
if fitting_type == 'ener':
self.fitting = EnerFitting(**fitting_param)
# elif fitting_type == 'wfc':
# self.fitting = WFCFitting(fitting_param, self.descrpt)
elif fitting_type == 'dipole':
if descrpt_type == 'se_e2_a':
self.fitting = DipoleFittingSeA(**fitting_param)
else :
raise RuntimeError('fitting dipole only supports descriptors: se_e2_a')
elif fitting_type == 'polar':
# if descrpt_type == 'loc_frame':
# self.fitting = PolarFittingLocFrame(fitting_param, self.descrpt)
if descrpt_type == 'se_e2_a':
self.fitting = PolarFittingSeA(**fitting_param)
else :
raise RuntimeError('fitting polar only supports descriptors: loc_frame and se_e2_a')
elif fitting_type == 'global_polar':
if descrpt_type == 'se_e2_a':
self.fitting = GlobalPolarFittingSeA(**fitting_param)
else :
raise RuntimeError('fitting global_polar only supports descriptors: loc_frame and se_e2_a')
else :
raise RuntimeError('unknown fitting type ' + fitting_type)
# type embedding
padding = False
if descrpt_type == 'se_atten':
padding = True
if typeebd_param is not None:
self.typeebd = TypeEmbedNet(
neuron=typeebd_param['neuron'],
resnet_dt=typeebd_param['resnet_dt'],
activation_function=typeebd_param['activation_function'],
precision=typeebd_param['precision'],
trainable=typeebd_param['trainable'],
seed=typeebd_param['seed'],
padding=padding
)
elif descrpt_type == 'se_atten':
default_args = type_embedding_args()
default_args_dict = {i.name: i.default for i in default_args}
self.typeebd = TypeEmbedNet(
neuron=default_args_dict['neuron'],
resnet_dt=default_args_dict['resnet_dt'],
activation_function=None,
precision=default_args_dict['precision'],
trainable=default_args_dict['trainable'],
seed=default_args_dict['seed'],
padding=padding
)
else:
self.typeebd = None
# init model
# infer model type by fitting_type
if fitting_type == 'ener':
self.model = EnerModel(
self.descrpt,
self.fitting,
self.typeebd,
model_param.get('type_map'),
model_param.get('data_stat_nbatch', 10),
model_param.get('data_stat_protect', 1e-2),
model_param.get('use_srtab'),
model_param.get('smin_alpha'),
model_param.get('sw_rmin'),
model_param.get('sw_rmax')
)
# elif fitting_type == 'wfc':
# self.model = WFCModel(model_param, self.descrpt, self.fitting)
elif fitting_type == 'dipole':
self.model = DipoleModel(
self.descrpt,
self.fitting,
model_param.get('type_map'),
model_param.get('data_stat_nbatch', 10),
model_param.get('data_stat_protect', 1e-2)
)
elif fitting_type == 'polar':
self.model = PolarModel(
self.descrpt,
self.fitting,
model_param.get('type_map'),
model_param.get('data_stat_nbatch', 10),
model_param.get('data_stat_protect', 1e-2)
)
elif fitting_type == 'global_polar':
self.model = GlobalPolarModel(
self.descrpt,
self.fitting,
model_param.get('type_map'),
model_param.get('data_stat_nbatch', 10),
model_param.get('data_stat_protect', 1e-2)
)
else :
raise RuntimeError('get unknown fitting type when building model')
# learning rate
lr_param = j_must_have(jdata, 'learning_rate')
scale_by_worker = lr_param.get('scale_by_worker', 'linear')
if scale_by_worker == 'linear':
self.scale_lr_coef = float(self.run_opt.world_size)
elif scale_by_worker == 'sqrt':
self.scale_lr_coef = np.sqrt(self.run_opt.world_size).real
else:
self.scale_lr_coef = 1.
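# Illustration of the worker scaling above (hypothetical world_size = 4):
#   'linear' -> scale_lr_coef = 4.0, 'sqrt' -> scale_lr_coef = 2.0,
#   any other value leaves the learning rate unscaled (coef = 1.0).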
lr_type = lr_param.get('type', 'exp')
if lr_type == 'exp':
self.lr = LearningRateExp(lr_param['start_lr'],
lr_param['stop_lr'],
lr_param['decay_steps'])
else :
raise RuntimeError('unknown learning_rate type ' + lr_type)
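# Sketch of the exponential schedule (assuming LearningRateExp decays the rate by a
# fixed factor every `decay_steps` steps so that start_lr reaches stop_lr at the end
# of training): lr(step) ~= start_lr * (stop_lr / start_lr) ** (step / stop_batch).
# E.g. start_lr=1e-3, stop_lr=1e-8, stop_batch=1e6 gives lr(5e5) ~= 3.2e-6.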
# loss
# infer loss type by fitting_type
loss_param = jdata.get('loss', None)
loss_type = loss_param.get('type', 'ener')
if fitting_type == 'ener':
loss_param.pop('type', None)
loss_param['starter_learning_rate'] = self.lr.start_lr()
if loss_type == 'ener':
self.loss = EnerStdLoss(**loss_param)
elif loss_type == 'ener_dipole':
self.loss = EnerDipoleLoss(**loss_param)
else:
raise RuntimeError('unknown loss type')
elif fitting_type == 'wfc':
self.loss = TensorLoss(loss_param,
model = self.model,
tensor_name = 'wfc',
tensor_size = self.model.get_out_size(),
label_name = 'wfc')
elif fitting_type == 'dipole':
self.loss = TensorLoss(loss_param,
model = self.model,
tensor_name = 'dipole',
tensor_size = 3,
label_name = 'dipole')
elif fitting_type == 'polar':
self.loss = TensorLoss(loss_param,
model = self.model,
tensor_name = 'polar',
tensor_size = 9,
label_name = 'polarizability')
elif fitting_type == 'global_polar':
self.loss = TensorLoss(loss_param,
model = self.model,
tensor_name = 'global_polar',
tensor_size = 9,
atomic = False,
label_name = 'polarizability')
else :
raise RuntimeError('get unknown fitting type when building loss function')
# training
tr_data = jdata['training']
self.disp_file = tr_data.get('disp_file', 'lcurve.out')
self.disp_freq = tr_data.get('disp_freq', 1000)
self.save_freq = tr_data.get('save_freq', 1000)
self.save_ckpt = tr_data.get('save_ckpt', 'model.ckpt')
self.display_in_training = tr_data.get('disp_training', True)
self.timing_in_training = tr_data.get('time_training', True)
self.profiling = self.run_opt.is_chief and tr_data.get('profiling', False)
self.profiling_file = tr_data.get('profiling_file', 'timeline.json')
self.enable_profiler = tr_data.get('enable_profiler', False)
self.tensorboard = self.run_opt.is_chief and tr_data.get('tensorboard', False)
self.tensorboard_log_dir = tr_data.get('tensorboard_log_dir', 'log')
self.tensorboard_freq = tr_data.get('tensorboard_freq', 1)
self.mixed_prec = tr_data.get('mixed_precision', None)
if self.mixed_prec is not None:
if (self.mixed_prec['compute_prec'] != 'float16' or self.mixed_prec['output_prec'] != 'float32'):
raise RuntimeError(
"Unsupported mixed precision option [output_prec, compute_prec]: [%s, %s], "
" Supported: [float32, float16], Please set mixed precision option correctly!"
% (self.mixed_prec['output_prec'], self.mixed_prec['compute_prec']))
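# Hence the only accepted mixed-precision setting is, as a sketch of the
# corresponding "training" section entry:
#   "mixed_precision": {"compute_prec": "float16", "output_prec": "float32"}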
# self.sys_probs = tr_data['sys_probs']
# self.auto_prob_style = tr_data['auto_prob']
self.useBN = False
if fitting_type == 'ener' and self.fitting.get_numb_fparam() > 0 :
self.numb_fparam = self.fitting.get_numb_fparam()
else :
self.numb_fparam = 0
if tr_data.get("validation_data", None) is not None:
self.valid_numb_batch = tr_data["validation_data"].get("numb_btch", 1)
else:
self.valid_numb_batch = 1
# whether to initialize the graph from a frozen model
self.frz_model = None
self.model_type = None
def build (self,
data = None,
stop_batch = 0,
suffix = "") :
self.ntypes = self.model.get_ntypes()
self.stop_batch = stop_batch
if not self.is_compress and data.mixed_type:
assert self.descrpt_type in ['se_atten'], 'Data in mixed_type format must use attention descriptor!'
assert self.fitting_type in ['ener'], 'Data in mixed_type format must use ener fitting!'
if self.numb_fparam > 0 :
log.info("training with %d frame parameter(s)" % self.numb_fparam)
else:
log.info("training without frame parameter")
if not self.is_compress:
# Usually, the type number of the model should be equal to that of the data
# However, nt_model > nt_data should be allowed, since users may only want to
# train using a dataset that only has some of the elements
if self.ntypes < data.get_ntypes():
raise ValueError(
"The number of types of the training data is %d, but that of the "
"model is only %d. The latter must be no less than the former. "
"You may need to reset one or both of them. Usually, the former "
"is given by `model/type_map` in the training parameter (if set) "
"or the maximum number in the training data. The latter is given "
"by `model/descriptor/sel` in the training parameter." % (
data.get_ntypes(), self.ntypes
))
self.type_map = data.get_type_map()
self.batch_size = data.get_batch_size()
if self.run_opt.init_mode not in ('init_from_model', 'restart', 'init_from_frz_model'):
# self.saver.restore (in self._init_session) will restore avg and std variables, so data_stat is useless
# init_from_frz_model will restore data_stat variables in `init_variables` method
log.info("data stating... (this step may take long time)")
self.model.data_stat(data)
# config the init_frz_model command
if self.run_opt.init_mode == 'init_from_frz_model':
self._init_from_frz_model()
# neighbor_stat has been moved to train.py to avoid duplication
# TODO: this is a simple fix but we should have a clear
# architecture to call neighbor stat
else :
graph, graph_def = load_graph_def(self.model_param['compress']['model_file'])
self.descrpt.enable_compression(self.model_param['compress']["min_nbor_dist"], self.model_param['compress']['model_file'], self.model_param['compress']['table_config'][0], self.model_param['compress']['table_config'][1], self.model_param['compress']['table_config'][2], self.model_param['compress']['table_config'][3])
self.fitting.init_variables(graph, graph_def)
# for fparam or aparam settings in 'ener' type fitting net
if self.fitting_type == 'ener':
self.fitting.enable_compression(self.model_param['compress']['model_file'])
if self.is_compress or self.model_type == 'compressed_model':
tf.constant("compressed_model", name = 'model_type', dtype = tf.string)
else:
tf.constant("original_model", name = 'model_type', dtype = tf.string)
if self.mixed_prec is not None:
self.descrpt.enable_mixed_precision(self.mixed_prec)
self.fitting.enable_mixed_precision(self.mixed_prec)
self._build_lr()
self._build_network(data, suffix)
self._build_training()
def _build_lr(self):
self._extra_train_ops = []
self.global_step = tf.train.get_or_create_global_step()
self.learning_rate = self.lr.build(self.global_step, self.stop_batch)
log.info("built lr")
def _build_network(self, data, suffix=""):
self.place_holders = {}
if self.is_compress :
for kk in ['coord', 'box']:
self.place_holders[kk] = tf.placeholder(GLOBAL_TF_FLOAT_PRECISION, [None], 't_' + kk)
self._get_place_horders(data_requirement)
else :
self._get_place_horders(data.get_data_dict())
self.place_holders['type'] = tf.placeholder(tf.int32, [None], name='t_type')
self.place_holders['natoms_vec'] = tf.placeholder(tf.int32, [self.ntypes+2], name='t_natoms')
self.place_holders['default_mesh'] = tf.placeholder(tf.int32, [None], name='t_mesh')
self.place_holders['is_training'] = tf.placeholder(tf.bool)
self.model_pred\
= self.model.build (self.place_holders['coord'],
self.place_holders['type'],
self.place_holders['natoms_vec'],
self.place_holders['box'],
self.place_holders['default_mesh'],
self.place_holders,
self.frz_model,
suffix = suffix,
reuse = False)
self.l2_l, self.l2_more\
= self.loss.build (self.learning_rate,
self.place_holders['natoms_vec'],
self.model_pred,
self.place_holders,
suffix = "test")
if self.mixed_prec is not None:
self.l2_l = tf.cast(self.l2_l, get_precision(self.mixed_prec['output_prec']))
log.info("built network")
def _build_training(self):
trainable_variables = tf.trainable_variables()
if self.run_opt.is_distrib:
if self.scale_lr_coef > 1.:
log.info('Scale learning rate by coef: %f', self.scale_lr_coef)
optimizer = tf.train.AdamOptimizer(self.learning_rate*self.scale_lr_coef)
else:
optimizer = tf.train.AdamOptimizer(self.learning_rate)
optimizer = self.run_opt._HVD.DistributedOptimizer(optimizer)
else:
optimizer = tf.train.AdamOptimizer(learning_rate = self.learning_rate)
if self.mixed_prec is not None:
_TF_VERSION = Version(TF_VERSION)
# check the TF_VERSION, when TF < 1.12, mixed precision is not allowed
if _TF_VERSION < Version('1.14.0'):
raise RuntimeError("TensorFlow version %s is not compatible with the mixed precision setting. Please consider upgrading your TF version!" % TF_VERSION)
elif _TF_VERSION < Version('2.4.0'):
optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer)
else:
optimizer = tf.mixed_precision.enable_mixed_precision_graph_rewrite(optimizer)
apply_op = optimizer.minimize(loss=self.l2_l,
global_step=self.global_step,
var_list=trainable_variables,
name='train_step')
train_ops = [apply_op] + self._extra_train_ops
self.train_op = tf.group(*train_ops)
log.info("built training")
def _init_session(self):
config = get_tf_session_config()
device, idx = self.run_opt.my_device.split(":", 1)
if device == "gpu":
config.gpu_options.visible_device_list = idx
self.sess = tf.Session(config=config)
# Initialize or restore global variables
init_op = tf.global_variables_initializer()
if self.run_opt.is_chief:
self.saver = tf.train.Saver(save_relative_paths=True)
if self.run_opt.init_mode == 'init_from_scratch' :
log.info("initialize model from scratch")
run_sess(self.sess, init_op)
if not self.is_compress:
fp = open(self.disp_file, "w")
fp.close ()
elif self.run_opt.init_mode == 'init_from_model' :
log.info("initialize from model %s" % self.run_opt.init_model)
run_sess(self.sess, init_op)
self.saver.restore (self.sess, self.run_opt.init_model)
run_sess(self.sess, self.global_step.assign(0))
fp = open(self.disp_file, "w")
fp.close ()
elif self.run_opt.init_mode == 'restart' :
log.info("restart from model %s" % self.run_opt.restart)
run_sess(self.sess, init_op)
self.saver.restore (self.sess, self.run_opt.restart)
elif self.run_opt.init_mode == 'init_from_frz_model' :
log.info("initialize training from the frozen model")
run_sess(self.sess, init_op)
fp = open(self.disp_file, "w")
fp.close ()
else :
raise RuntimeError ("unkown init mode")
else:
run_sess(self.sess, init_op)
self.saver = None
# Ensure variable consistency among tasks when training starts
if self.run_opt.is_distrib:
bcast_op = self.run_opt._HVD.broadcast_global_variables(0)
if self.run_opt.is_chief:
log.info('broadcast global variables to other tasks')
else:
log.info('receive global variables from task#0')
run_sess(self.sess, bcast_op)
def train (self, train_data = None, valid_data=None) :
# if valid_data is None: # no validation set specified.
# valid_data = train_data # using training set as validation set.
stop_batch = self.stop_batch
self._init_session()
# Before data sharding is enabled, only the chief does evaluation and records it
# self.print_head()
fp = None
if self.run_opt.is_chief :
fp = open(self.disp_file, "a")
cur_batch = run_sess(self.sess, self.global_step)
is_first_step = True
self.cur_batch = cur_batch
log.info("start training at lr %.2e (== %.2e), decay_step %d, decay_rate %f, final lr will be %.2e" %
(run_sess(self.sess, self.learning_rate),
self.lr.value(cur_batch),
self.lr.decay_steps_,
self.lr.decay_rate_,
self.lr.value(stop_batch))
)
prf_options = None
prf_run_metadata = None
if self.profiling:
prf_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
prf_run_metadata = tf.RunMetadata()
# set tensorboard execution environment
if self.tensorboard:
summary_merged_op = tf.summary.merge_all()
# Remove the old TB logging directory from the previous run
try:
shutil.rmtree(self.tensorboard_log_dir)
except FileNotFoundError:
pass # directory does not exist, this is OK
except Exception as e:
# general error when removing directory, warn user
log.exception(
f"Could not remove old tensorboard logging directory: "
f"{self.tensorboard_log_dir}. Error: {e}"
)
else:
log.debug("Removing old tensorboard log directory.")
tb_train_writer = tf.summary.FileWriter(self.tensorboard_log_dir + '/train', self.sess.graph)
tb_valid_writer = tf.summary.FileWriter(self.tensorboard_log_dir + '/test')
else:
tb_train_writer = None
tb_valid_writer = None
if self.enable_profiler:
# https://www.tensorflow.org/guide/profiler
tfv2.profiler.experimental.start(self.tensorboard_log_dir)
train_time = 0
while cur_batch < stop_batch :
# first round validation:
train_batch = train_data.get_batch()
if self.display_in_training and is_first_step:
if self.run_opt.is_chief:
valid_batches = [valid_data.get_batch() for ii in range(self.valid_numb_batch)] if valid_data is not None else None
self.valid_on_the_fly(fp, [train_batch], valid_batches, print_header=True)
is_first_step = False
if self.timing_in_training: tic = time.time()
train_feed_dict = self.get_feed_dict(train_batch, is_training=True)
# use tensorboard to visualize the training of deepmd-kit
# it will take some extra execution time to generate the tensorboard data
if self.tensorboard and (cur_batch % self.tensorboard_freq == 0):
summary, _ = run_sess(self.sess, [summary_merged_op, self.train_op], feed_dict=train_feed_dict,
options=prf_options, run_metadata=prf_run_metadata)
tb_train_writer.add_summary(summary, cur_batch)
else:
run_sess(self.sess, [self.train_op], feed_dict=train_feed_dict,
options=prf_options, run_metadata=prf_run_metadata)
if self.timing_in_training: toc = time.time()
if self.timing_in_training: train_time += toc - tic
cur_batch = run_sess(self.sess, self.global_step)
self.cur_batch = cur_batch
# on-the-fly validation
if self.display_in_training and (cur_batch % self.disp_freq == 0):
if self.timing_in_training:
tic = time.time()
if self.run_opt.is_chief:
valid_batches = [valid_data.get_batch() for ii in range(self.valid_numb_batch)] if valid_data is not None else None
self.valid_on_the_fly(fp, [train_batch], valid_batches)
if self.timing_in_training:
toc = time.time()
test_time = toc - tic
log.info("batch %7d training time %.2f s, testing time %.2f s"
% (cur_batch, train_time, test_time))
train_time = 0
if self.save_freq > 0 and cur_batch % self.save_freq == 0 and self.saver is not None:
self.save_checkpoint(cur_batch)
if (self.save_freq == 0 or cur_batch == 0 or cur_batch % self.save_freq != 0) and self.saver is not None:
self.save_checkpoint(cur_batch)
if self.run_opt.is_chief:
fp.close ()
if self.profiling and self.run_opt.is_chief :
fetched_timeline = timeline.Timeline(prf_run_metadata.step_stats)
chrome_trace = fetched_timeline.generate_chrome_trace_format()
with open(self.profiling_file, 'w') as f:
f.write(chrome_trace)
if self.enable_profiler and self.run_opt.is_chief:
tfv2.profiler.experimental.stop()
def save_checkpoint(self, cur_batch: int):
try:
ckpt_prefix = self.saver.save (self.sess, os.path.join(os.getcwd(), self.save_ckpt), global_step=cur_batch)
except google.protobuf.message.DecodeError as e:
raise GraphTooLargeError(
"The graph size exceeds 2 GB, the hard limitation of protobuf."
" Then a DecodeError was raised by protobuf. You should "
"reduce the size of your model."
) from e
# make symlinks from the step-suffixed prefix to the plain prefix so that nothing breaks
# get all checkpoint files
original_files = glob.glob(ckpt_prefix + ".*")
for ori_ff in original_files:
new_ff = self.save_ckpt + ori_ff[len(ckpt_prefix):]
try:
# remove old one
os.remove(new_ff)
except OSError:
pass
if platform.system() != 'Windows':
# by default one does not have permission to create symlinks on Windows
os.symlink(ori_ff, new_ff)
else:
shutil.copyfile(ori_ff, new_ff)
log.info("saved checkpoint %s" % self.save_ckpt)
def get_feed_dict(self, batch, is_training):
feed_dict = {}
for kk in batch.keys():
if kk == 'find_type' or kk == 'type' or kk == 'real_natoms_vec':
continue
if 'find_' in kk:
feed_dict[self.place_holders[kk]] = batch[kk]
else:
feed_dict[self.place_holders[kk]] = np.reshape(batch[kk], [-1])
for ii in ['type']:
feed_dict[self.place_holders[ii]] = np.reshape(batch[ii], [-1])
for ii in ['natoms_vec', 'default_mesh']:
feed_dict[self.place_holders[ii]] = batch[ii]
feed_dict[self.place_holders['is_training']] = is_training
return feed_dict
def get_global_step(self):
return run_sess(self.sess, self.global_step)
# def print_head (self) : # deprecated
# if self.run_opt.is_chief:
# fp = open(self.disp_file, "a")
# print_str = "# %5s" % 'batch'
# print_str += self.loss.print_header()
# print_str += ' %8s\n' % 'lr'
# fp.write(print_str)
# fp.close ()
def valid_on_the_fly(self,
fp,
train_batches,
valid_batches,
print_header=False):
train_results = self.get_evaluation_results(train_batches)
valid_results = self.get_evaluation_results(valid_batches)
cur_batch = self.cur_batch
current_lr = run_sess(self.sess, self.learning_rate)
if print_header:
self.print_header(fp, train_results, valid_results)
self.print_on_training(fp, train_results, valid_results, cur_batch, current_lr)
@staticmethod
def print_header(fp, train_results, valid_results):
print_str = ''
print_str += "# %5s" % 'step'
if valid_results is not None:
prop_fmt = ' %11s %11s'
for k in train_results.keys():
print_str += prop_fmt % (k + '_val', k + '_trn')
else:
prop_fmt = ' %11s'
for k in train_results.keys():
print_str += prop_fmt % (k + '_trn')
print_str += ' %8s\n' % 'lr'
fp.write(print_str)
fp.flush()
@staticmethod
def print_on_training(fp, train_results, valid_results, cur_batch, cur_lr):
print_str = ''
print_str += "%7d" % cur_batch
if valid_results is not None:
prop_fmt = " %11.2e %11.2e"
for k in valid_results.keys():
# assert k in train_results.keys()
print_str += prop_fmt % (valid_results[k], train_results[k])
else:
prop_fmt = " %11.2e"
for k in train_results.keys():
print_str += prop_fmt % (train_results[k])
print_str += " %8.1e\n" % cur_lr
fp.write(print_str)
fp.flush()
def get_evaluation_results(self, batch_list):
if batch_list is None: return None
numb_batch = len(batch_list)
sum_results = {} # sum of losses on all atoms
sum_natoms = 0
for i in range(numb_batch):
batch = batch_list[i]
natoms = batch["natoms_vec"]
feed_dict = self.get_feed_dict(batch, is_training=False)
results = self.loss.eval(self.sess, feed_dict, natoms)
for k, v in results.items():
if k == "natoms":
sum_natoms += v
else:
sum_results[k] = sum_results.get(k, 0.) + v * results["natoms"]
avg_results = {k: v / sum_natoms for k, v in sum_results.items() if not k == "natoms"}
return avg_results
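# A worked (hypothetical) example of the averaging above: two batches with
# natoms = 100 and 300 and per-atom losses 0.02 and 0.01 give
#   (0.02 * 100 + 0.01 * 300) / (100 + 300) = 0.0125,
# i.e. batches are weighted by their atom counts rather than averaged uniformly.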
def save_compressed(self):
"""
Save the compressed graph
"""
self._init_session()
if self.is_compress:
self.saver.save (self.sess, os.path.join(os.getcwd(), self.save_ckpt))
def _get_place_horders(self, data_dict):
for kk in data_dict.keys():
if kk == 'type':
continue
prec = GLOBAL_TF_FLOAT_PRECISION
if data_dict[kk]['high_prec'] :
prec = GLOBAL_ENER_FLOAT_PRECISION
self.place_holders[kk] = tf.placeholder(prec, [None], name = 't_' + kk)
self.place_holders['find_' + kk] = tf.placeholder(tf.float32, name = 't_find_' + kk)
def _init_from_frz_model(self):
try:
graph, graph_def = load_graph_def(self.run_opt.init_frz_model)
except FileNotFoundError as e:
# throw runtime error if there's no frozen model
raise RuntimeError(
"The input frozen model %s (%s) does not exist! Please check the path of the frozen model. " % (self.run_opt.init_frz_model, os.path.abspath(self.run_opt.init_frz_model))
) from e
# get the model type from the frozen model (self.run_opt.init_frz_model)
try:
t_model_type = get_tensor_by_name_from_graph(graph, 'model_type')
except GraphWithoutTensorError as e:
# throw runtime error if the frozen_model has no model type information...
raise RuntimeError(
"The input frozen model: %s has no 'model_type' information, "
"which is not supported by the 'dp train init-frz-model' interface. " % self.run_opt.init_frz_model
) from e
else:
self.model_type = bytes.decode(t_model_type)
if self.model_type == 'compressed_model':
self.frz_model = self.run_opt.init_frz_model
self.model.init_variables(graph, graph_def, model_type=self.model_type)
#
from .data import DeepmdData
from .data_system import DeepmdDataSystem
# out-of-date
from .data import DataSets
from .data_system import DataSystem
from .pair_tab import PairTab
from .learning_rate import LearningRateExp
from .plugin import Plugin, PluginVariant
from typing import List, Callable
from dargs import dargs, Argument, Variant, ArgumentEncoder
from deepmd import descriptor
from deepmd.common import ACTIVATION_FN_DICT, PRECISION_DICT
from deepmd.utils.plugin import Plugin
import json
from deepmd.nvnmd.utils.argcheck import nvnmd_args
def list_to_doc(xx):
items = []
for ii in xx:
if len(items) == 0:
items.append(f'"{ii}"')
else:
items.append(f', "{ii}"')
items.append('.')
return ''.join(items)
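# For instance (following the loop above):
#   list_to_doc(["tanh", "gelu"]) -> '"tanh", "gelu".'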
def make_link(content, ref_key):
return f'`{content} <{ref_key}_>`_' if not dargs.RAW_ANCHOR \
else f'`{content} <#{ref_key}>`_'
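# For example, make_link('se_e2_a', 'model/descriptor[se_e2_a]') yields the RST link
# '`se_e2_a <model/descriptor[se_e2_a]_>`_', or the raw-anchor form
# '`se_e2_a <#model/descriptor[se_e2_a]>`_' when dargs.RAW_ANCHOR is set.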
def type_embedding_args():
doc_neuron = 'Number of neurons in each hidden layer of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built.'
doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection'
doc_seed = 'Random seed for parameter initialization'
doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())} Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version.'
doc_precision = f'The precision of the embedding net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision.'
doc_trainable = 'If the parameters in the embedding net are trainable'
return [
Argument("neuron", list, optional = True, default = [8], doc = doc_neuron),
Argument("activation_function", str, optional = True, default = 'tanh', doc = doc_activation_function),
Argument("resnet_dt", bool, optional = True, default = False, doc = doc_resnet_dt),
Argument("precision", str, optional = True, default = "default", doc = doc_precision),
Argument("trainable", bool, optional = True, default = True, doc = doc_trainable),
Argument("seed", [int,None], optional = True, default = None, doc = doc_seed),
]
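# A minimal "type_embedding" input section consistent with the defaults above
# (sketch only, values are just the defaults):
#   "type_embedding": {
#       "neuron": [8], "activation_function": "tanh", "resnet_dt": false,
#       "precision": "default", "trainable": true, "seed": null
#   }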
# --- Descriptor configurations: --- #
class ArgsPlugin:
def __init__(self) -> None:
self.__plugin = Plugin()
def register(self, name : str, alias : List[str] = None) -> Callable[[], List[Argument]]:
"""Regiester a descriptor argument plugin.
Parameters
----------
name : str
the name of a descriptor
alias : List[str], optional
the list of aliases of this descriptor
Returns
-------
Callable[[], List[Argument]]
the registered descriptor argument method
Examples
--------
>>> some_plugin = ArgsPlugin()
>>> @some_plugin.register("some_descrpt")
... def descrpt_some_descrpt_args():
...     return []
"""
# convert alias to hashed item
if isinstance(alias, list):
alias = tuple(alias)
return self.__plugin.register((name, alias))
def get_all_argument(self, exclude_hybrid: bool = False) -> List[Argument]:
"""Get all arguments.
Parameters
----------
exclude_hybrid : bool
exclude hybrid descriptor to prevent circular calls
Returns
-------
List[Argument]
all arguments
"""
arguments = []
for (name, alias), metd in self.__plugin.plugins.items():
if exclude_hybrid and name == "hybrid":
continue
arguments.append(Argument(name=name, dtype=dict, sub_fields=metd(), alias=alias))
return arguments
descrpt_args_plugin = ArgsPlugin()
@descrpt_args_plugin.register("loc_frame")
def descrpt_local_frame_args ():
doc_sel_a = 'A list of integers. The length of the list should be the same as the number of atom types in the system. `sel_a[i]` gives the selected number of type-i neighbors. The full relative coordinates of the neighbors are used by the descriptor.'
doc_sel_r = 'A list of integers. The length of the list should be the same as the number of atom types in the system. `sel_r[i]` gives the selected number of type-i neighbors. Only the relative distances of the neighbors are used by the descriptor. sel_a[i] + sel_r[i] is recommended to be larger than the maximum possible number of type-i neighbors within the cut-off radius.'
doc_rcut = 'The cut-off radius. The default value is 6.0'
doc_axis_rule = 'A list of integers. The length should be 6 times the number of types. \n\n\
- axis_rule[i*6+0]: class of the atom defining the first axis of type-i atom. 0 for neighbors with full coordinates and 1 for neighbors only with relative distance.\n\n\
- axis_rule[i*6+1]: type of the atom defining the first axis of type-i atom.\n\n\
- axis_rule[i*6+2]: index of the axis atom defining the first axis. Note that the neighbors with the same class and type are sorted according to their relative distance.\n\n\
- axis_rule[i*6+3]: class of the atom defining the second axis of type-i atom. 0 for neighbors with full coordinates and 1 for neighbors only with relative distance.\n\n\
- axis_rule[i*6+4]: type of the atom defining the second axis of type-i atom.\n\n\
- axis_rule[i*6+5]: index of the axis atom defining the second axis. Note that the neighbors with the same class and type are sorted according to their relative distance.'
return [
Argument("sel_a", list, optional = False, doc = doc_sel_a),
Argument("sel_r", list, optional = False, doc = doc_sel_r),
Argument("rcut", float, optional = True, default = 6.0, doc = doc_rcut),
Argument("axis_rule", list, optional = False, doc = doc_axis_rule)
]
@descrpt_args_plugin.register("se_e2_a", alias=["se_a"])
def descrpt_se_a_args():
doc_sel = 'This parameter sets the number of selected neighbors for each type of atom. It can be:\n\n\
- `List[int]`. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. `sel[i]` is recommended to be larger than the maximum possible number of type-i neighbors within the cut-off radius. Note that the total sel value must be less than 4096 in a GPU environment.\n\n\
- `str`. Can be "auto:factor" or "auto". "factor" is a float number larger than 1. This option will automatically determine the `sel`. In detail it counts the maximal number of neighbors within the cutoff radius for each type of neighbor, then multiplies the maximum by the "factor". Finally the number is rounded up to be divisible by 4. The option "auto" is equivalent to "auto:1.1".'
doc_rcut = 'The cut-off radius.'
doc_rcut_smth = 'Where to start smoothing. For example the 1/r term is smoothed from `rcut` to `rcut_smth`'
doc_neuron = 'Number of neurons in each hidden layer of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built.'
doc_axis_neuron = 'Size of the submatrix of G (embedding matrix).'
doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())} Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version.'
doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection'
doc_type_one_side = 'If true, build N_types embedding nets; otherwise, build N_types^2 embedding nets.'
doc_precision = f'The precision of the embedding net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision.'
doc_trainable = 'If the parameters in the embedding net are trainable'
doc_seed = 'Random seed for parameter initialization'
doc_exclude_types = 'The excluded pairs of types which have no interaction with each other. For example, `[[0, 1]]` means no interaction between type 0 and type 1.'
doc_set_davg_zero = 'Set the normalization average to zero. This option should be set when `atom_ener` in the energy fitting is used'
return [
Argument("sel", [list,str], optional = True, default = "auto", doc = doc_sel),
Argument("rcut", float, optional = True, default = 6.0, doc = doc_rcut),
Argument("rcut_smth", float, optional = True, default = 0.5, doc = doc_rcut_smth),
Argument("neuron", list, optional = True, default = [10,20,40], doc = doc_neuron),
Argument("axis_neuron", int, optional = True, default = 4, alias = ['n_axis_neuron'], doc = doc_axis_neuron),
Argument("activation_function", str, optional = True, default = 'tanh', doc = doc_activation_function),
Argument("resnet_dt", bool, optional = True, default = False, doc = doc_resnet_dt),
Argument("type_one_side", bool, optional = True, default = False, doc = doc_type_one_side),
Argument("precision", str, optional = True, default = "default", doc = doc_precision),
Argument("trainable", bool, optional = True, default = True, doc = doc_trainable),
Argument("seed", [int,None], optional = True, doc = doc_seed),
Argument("exclude_types", list, optional = True, default = [], doc = doc_exclude_types),
Argument("set_davg_zero", bool, optional = True, default = False, doc = doc_set_davg_zero)
]
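# An example "descriptor" input section built from the se_e2_a defaults above
# (illustrative only, not a recommendation):
#   "descriptor": {
#       "type": "se_e2_a", "sel": "auto", "rcut": 6.0, "rcut_smth": 0.5,
#       "neuron": [10, 20, 40], "axis_neuron": 4, "resnet_dt": false
#   }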
@descrpt_args_plugin.register("se_e3", alias=['se_at', 'se_a_3be', 'se_t'])
def descrpt_se_t_args():
doc_sel = 'This parameter sets the number of selected neighbors for each type of atom. It can be:\n\n\
- `List[int]`. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. `sel[i]` is recommended to be larger than the maximum possible number of type-i neighbors within the cut-off radius. Note that the total sel value must be less than 4096 in a GPU environment.\n\n\
- `str`. Can be "auto:factor" or "auto". "factor" is a float number larger than 1. This option will automatically determine the `sel`. In detail it counts the maximal number of neighbors within the cutoff radius for each type of neighbor, then multiplies the maximum by the "factor". Finally the number is rounded up to be divisible by 4. The option "auto" is equivalent to "auto:1.1".'
doc_rcut = 'The cut-off radius.'
doc_rcut_smth = 'Where to start smoothing. For example the 1/r term is smoothed from `rcut` to `rcut_smth`'
doc_neuron = 'Number of neurons in each hidden layer of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built.'
doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())} Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version.'
doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection'
doc_precision = f'The precision of the embedding net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision.'
doc_trainable = 'If the parameters in the embedding net are trainable'
doc_seed = 'Random seed for parameter initialization'
doc_set_davg_zero = 'Set the normalization average to zero. This option should be set when `atom_ener` in the energy fitting is used'
return [
Argument("sel", [list,str], optional = True, default = "auto", doc = doc_sel),
Argument("rcut", float, optional = True, default = 6.0, doc = doc_rcut),
Argument("rcut_smth", float, optional = True, default = 0.5, doc = doc_rcut_smth),
Argument("neuron", list, optional = True, default = [10,20,40], doc = doc_neuron),
Argument("activation_function", str, optional = True, default = 'tanh', doc = doc_activation_function),
Argument("resnet_dt", bool, optional = True, default = False, doc = doc_resnet_dt),
Argument("precision", str, optional = True, default = "default", doc = doc_precision),
Argument("trainable", bool, optional = True, default = True, doc = doc_trainable),
Argument("seed", [int,None], optional = True, doc = doc_seed),
Argument("set_davg_zero", bool, optional = True, default = False, doc = doc_set_davg_zero)
]
@descrpt_args_plugin.register("se_a_tpe", alias=['se_a_ebd'])
def descrpt_se_a_tpe_args():
doc_type_nchanl = 'Number of channels for the type embedding'
doc_type_nlayer = 'Number of hidden layers of the type embedding net'
doc_numb_aparam = 'Dimension of the atomic parameter. If set to a value > 0, the atomic parameters are embedded.'
return descrpt_se_a_args() + [
Argument("type_nchanl", int, optional = True, default = 4, doc = doc_type_nchanl),
Argument("type_nlayer", int, optional = True, default = 2, doc = doc_type_nlayer),
Argument("numb_aparam", int, optional = True, default = 0, doc = doc_numb_aparam)
]
@descrpt_args_plugin.register("se_e2_r", alias=['se_r'])
def descrpt_se_r_args():
doc_sel = 'This parameter sets the number of selected neighbors for each type of atom. It can be:\n\n\
- `List[int]`. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. `sel[i]` is recommended to be larger than the maximum possible number of type-i neighbors within the cut-off radius. Note that the total sel value must be less than 4096 in a GPU environment.\n\n\
- `str`. Can be "auto:factor" or "auto". "factor" is a float number larger than 1. This option will automatically determine the `sel`. In detail it counts the maximal number of neighbors within the cutoff radius for each type of neighbor, then multiplies the maximum by the "factor". Finally the number is rounded up to be divisible by 4. The option "auto" is equivalent to "auto:1.1".'
doc_rcut = 'The cut-off radius.'
doc_rcut_smth = 'Where to start smoothing. For example the 1/r term is smoothed from `rcut` to `rcut_smth`'
doc_neuron = 'Number of neurons in each hidden layer of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built.'
doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())} Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version.'
doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection'
doc_type_one_side = 'If true, build N_types embedding nets; otherwise, build N_types^2 embedding nets.'
doc_precision = f'The precision of the embedding net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision.'
doc_trainable = 'If the parameters in the embedding net are trainable'
doc_seed = 'Random seed for parameter initialization'
doc_exclude_types = 'The excluded pairs of types which have no interaction with each other. For example, `[[0, 1]]` means no interaction between type 0 and type 1.'
doc_set_davg_zero = 'Set the normalization average to zero. This option should be set when `atom_ener` in the energy fitting is used'
return [
Argument("sel", [list,str], optional = True, default = "auto", doc = doc_sel),
Argument("rcut", float, optional = True, default = 6.0, doc = doc_rcut),
Argument("rcut_smth", float, optional = True, default = 0.5, doc = doc_rcut_smth),
Argument("neuron", list, optional = True, default = [10,20,40], doc = doc_neuron),
Argument("activation_function", str, optional = True, default = 'tanh', doc = doc_activation_function),
Argument("resnet_dt", bool, optional = True, default = False, doc = doc_resnet_dt),
Argument("type_one_side", bool, optional = True, default = False, doc = doc_type_one_side),
Argument("precision", str, optional = True, default = "default", doc = doc_precision),
Argument("trainable", bool, optional = True, default = True, doc = doc_trainable),
Argument("seed", [int,None], optional = True, doc = doc_seed),
Argument("exclude_types", list, optional = True, default = [], doc = doc_exclude_types),
Argument("set_davg_zero", bool, optional = True, default = False, doc = doc_set_davg_zero)
]
@descrpt_args_plugin.register("hybrid")
def descrpt_hybrid_args():
doc_list = 'A list of descriptor definitions.'
return [
Argument("list", list, optional = False, doc = doc_list)
]
@descrpt_args_plugin.register("se_atten")
def descrpt_se_atten_args():
doc_sel = 'This parameter sets the number of selected neighbors. Note that this parameter is a little different from that in other descriptors. Instead of separating each type of atom, only the summation matters. This number is closely related to the efficiency, so one should not make it too large. Usually 200 or less is enough, well below the GPU limit of 4096. It can be:\n\n\
- `int`. The maximum number of neighbor atoms to be considered. We recommend it to be less than 200. \n\n\
- `List[int]`. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. Only the summation of `sel[i]` matters, and it is recommended to be less than 200.\n\n\
- `str`. Can be "auto:factor" or "auto". "factor" is a float number larger than 1. This option will automatically determine the `sel`. In detail it counts the maximal number of neighbors within the cutoff radius for each type of neighbor, then multiplies the maximum by the "factor". Finally the number is rounded up to be divisible by 4. The option "auto" is equivalent to "auto:1.1".'
doc_rcut = 'The cut-off radius.'
doc_rcut_smth = 'Where to start smoothing. For example the 1/r term is smoothed from `rcut` to `rcut_smth`'
doc_neuron = 'Number of neurons in each hidden layer of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built.'
doc_axis_neuron = 'Size of the submatrix of G (embedding matrix).'
doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())} Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version.'
doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection'
doc_type_one_side = 'Whether to consider the information from only one side or both sides.'
doc_precision = f'The precision of the embedding net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision.'
doc_trainable = 'If the parameters in the embedding net are trainable'
doc_seed = 'Random seed for parameter initialization'
doc_exclude_types = 'The excluded pairs of types which have no interaction with each other. For example, `[[0, 1]]` means no interaction between type 0 and type 1.'
doc_set_davg_zero = 'Set the normalization average to zero. This option should be set when `atom_ener` in the energy fitting is used'
doc_attn = 'The length of hidden vectors in attention layers'
doc_attn_layer = 'The number of attention layers'
doc_attn_dotr = 'Whether to do dot product with the normalized relative coordinates'
doc_attn_mask = 'Whether to do mask on the diagonal in the attention matrix'
return [
Argument("sel", [int, list, str], optional=True, default="auto", doc=doc_sel),
Argument("rcut", float, optional=True, default=6.0, doc=doc_rcut),
Argument("rcut_smth", float, optional=True, default=0.5, doc=doc_rcut_smth),
Argument("neuron", list, optional=True, default=[10, 20, 40], doc=doc_neuron),
Argument("axis_neuron", int, optional=True, default=4, alias=['n_axis_neuron'], doc=doc_axis_neuron),
Argument("activation_function", str, optional=True, default='tanh', doc=doc_activation_function),
Argument("resnet_dt", bool, optional=True, default=False, doc=doc_resnet_dt),
Argument("type_one_side", bool, optional=True, default=False, doc=doc_type_one_side),
Argument("precision", str, optional=True, default="default", doc=doc_precision),
Argument("trainable", bool, optional=True, default=True, doc=doc_trainable),
Argument("seed", [int, None], optional=True, doc=doc_seed),
Argument("exclude_types", list, optional=True, default=[], doc=doc_exclude_types),
Argument("set_davg_zero", bool, optional=True, default=False, doc=doc_set_davg_zero),
Argument("attn", int, optional=True, default=128, doc=doc_attn),
Argument("attn_layer", int, optional=True, default=2, doc=doc_attn_layer),
Argument("attn_dotr", bool, optional=True, default=True, doc=doc_attn_dotr),
Argument("attn_mask", bool, optional=True, default=False, doc=doc_attn_mask)
]
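# Sketch of an attention ("se_atten") descriptor section using the defaults above:
#   "descriptor": {
#       "type": "se_atten", "rcut": 6.0, "rcut_smth": 0.5, "neuron": [10, 20, 40],
#       "attn": 128, "attn_layer": 2, "attn_dotr": true, "attn_mask": false
#   }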
def descrpt_variant_type_args(exclude_hybrid: bool = False) -> Variant:
link_lf = make_link('loc_frame', 'model/descriptor[loc_frame]')
link_se_e2_a = make_link('se_e2_a', 'model/descriptor[se_e2_a]')
link_se_e2_r = make_link('se_e2_r', 'model/descriptor[se_e2_r]')
link_se_e3 = make_link('se_e3', 'model/descriptor[se_e3]')
link_se_a_tpe = make_link('se_a_tpe', 'model/descriptor[se_a_tpe]')
link_hybrid = make_link('hybrid', 'model/descriptor[hybrid]')
link_se_atten = make_link('se_atten', 'model/descriptor[se_atten]')
doc_descrpt_type = f'The type of the descriptor. See explanation below. \n\n\
- `loc_frame`: Defines a local frame at each atom, and then computes the descriptor as local coordinates under this frame.\n\n\
- `se_e2_a`: Used by the smooth edition of Deep Potential. The full relative coordinates are used to construct the descriptor.\n\n\
- `se_e2_r`: Used by the smooth edition of Deep Potential. Only the distance between atoms is used to construct the descriptor.\n\n\
- `se_e3`: Used by the smooth edition of Deep Potential. The full relative coordinates are used to construct the descriptor. Three-body embedding will be used by this descriptor.\n\n\
- `se_a_tpe`: Used by the smooth edition of Deep Potential. The full relative coordinates are used to construct the descriptor. Type embedding will be used by this descriptor.\n\n\
- `se_atten`: Used by the smooth edition of Deep Potential. The full relative coordinates are used to construct the descriptor. Attention mechanism will be used by this descriptor.\n\n\
- `hybrid`: Concatenate a list of descriptors to form a new descriptor.'
return Variant("type", descrpt_args_plugin.get_all_argument(), doc = doc_descrpt_type)
# --- Fitting net configurations: --- #
def fitting_ener():
doc_numb_fparam = 'The dimension of the frame parameter. If set to >0, file `fparam.npy` should be included to provide the input fparams.'
doc_numb_aparam = 'The dimension of the atomic parameter. If set to >0, file `aparam.npy` should be included to provide the input aparams.'
doc_neuron = 'The number of neurons in each hidden layer of the fitting net. When two hidden layers are of the same size, a skip connection is built.'
doc_activation_function = f'The activation function in the fitting net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())} Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version.'
doc_precision = f'The precision of the fitting net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision.'
doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection'
doc_trainable = 'Whether the parameters in the fitting net are trainable. This option can be\n\n\
- bool: True if all parameters of the fitting net are trainable, False otherwise.\n\n\
- list of bool: Specifies if each layer is trainable. Since the fitting net is composed of hidden layers followed by an output layer, the length of this list should be equal to len(`neuron`)+1.'
doc_rcond = 'The condition number used to determine the initial energy shift for each type of atoms.'
doc_seed = 'Random seed for parameter initialization of the fitting net'
doc_atom_ener = 'Specify the atomic energy in vacuum for each type'
return [
Argument("numb_fparam", int, optional = True, default = 0, doc = doc_numb_fparam),
Argument("numb_aparam", int, optional = True, default = 0, doc = doc_numb_aparam),
Argument("neuron", list, optional = True, default = [120,120,120], alias = ['n_neuron'], doc = doc_neuron),
Argument("activation_function", str, optional = True, default = 'tanh', doc = doc_activation_function),
Argument("precision", str, optional = True, default = 'default', doc = doc_precision),
Argument("resnet_dt", bool, optional = True, default = True, doc = doc_resnet_dt),
Argument("trainable", [list,bool], optional = True, default = True, doc = doc_trainable),
Argument("rcond", float, optional = True, default = 1e-3, doc = doc_rcond),
Argument("seed", [int,None], optional = True, doc = doc_seed),
Argument("atom_ener", list, optional = True, default = [], doc = doc_atom_ener)
]
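# Example "fitting_net" section for an energy model, using only the defaults above:
#   "fitting_net": {"type": "ener", "neuron": [120, 120, 120], "resnet_dt": true}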
def fitting_polar():
doc_neuron = 'The number of neurons in each hidden layer of the fitting net. When two hidden layers are of the same size, a skip connection is built.'
doc_activation_function = f'The activation function in the fitting net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())} Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version.'
doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection'
doc_precision = f'The precision of the fitting net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision.'
doc_scale = 'The output of the fitting net (polarizability matrix) will be scaled by ``scale``'
#doc_diag_shift = 'The diagonal part of the polarizability matrix will be shifted by ``diag_shift``. The shift operation is carried out after ``scale``.'
doc_fit_diag = 'Fit the diagonal part of the rotational invariant polarizability matrix, which will be converted to normal polarizability matrix by contracting with the rotation matrix.'
doc_sel_type = 'The atom types for which the atomic polarizability will be provided. If not set, all types will be selected.'
doc_seed = 'Random seed for parameter initialization of the fitting net'
# YWolfeee: user can decide whether to use shift diag
doc_shift_diag = 'Whether to shift the diagonal of polar, which is beneficial to training. Default is true.'
return [
Argument("neuron", list, optional = True, default = [120,120,120], alias = ['n_neuron'], doc = doc_neuron),
Argument("activation_function", str, optional = True, default = 'tanh', doc = doc_activation_function),
Argument("resnet_dt", bool, optional = True, default = True, doc = doc_resnet_dt),
Argument("precision", str, optional = True, default = 'default', doc = doc_precision),
Argument("fit_diag", bool, optional = True, default = True, doc = doc_fit_diag),
Argument("scale", [list,float], optional = True, default = 1.0, doc = doc_scale),
#Argument("diag_shift", [list,float], optional = True, default = 0.0, doc = doc_diag_shift),
Argument("shift_diag", bool, optional = True, default = True, doc = doc_shift_diag),
Argument("sel_type", [list,int,None], optional = True, alias = ['pol_type'], doc = doc_sel_type),
Argument("seed", [int,None], optional = True, doc = doc_seed)
]
#def fitting_global_polar():
# return fitting_polar()
def fitting_dipole():
doc_neuron = 'The number of neurons in each hidden layer of the fitting net. When two hidden layers are of the same size, a skip connection is built.'
doc_activation_function = f'The activation function in the fitting net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())} Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version.'
doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection'
doc_precision = f'The precision of the fitting net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision.'
doc_sel_type = 'The atom types for which the atomic dipole will be provided. If not set, all types will be selected.'
doc_seed = 'Random seed for parameter initialization of the fitting net'
return [
Argument("neuron", list, optional = True, default = [120,120,120], alias = ['n_neuron'], doc = doc_neuron),
Argument("activation_function", str, optional = True, default = 'tanh', doc = doc_activation_function),
Argument("resnet_dt", bool, optional = True, default = True, doc = doc_resnet_dt),
Argument("precision", str, optional = True, default = 'default', doc = doc_precision),
Argument("sel_type", [list,int,None], optional = True, alias = ['dipole_type'], doc = doc_sel_type),
Argument("seed", [int,None], optional = True, doc = doc_seed)
]
# YWolfeee: Delete global polar mode, merge it into polar mode and use loss setting to support.
def fitting_variant_type_args():
doc_descrpt_type = 'The type of the fitting. See explanation below. \n\n\
- `ener`: Fit an energy model (potential energy surface).\n\n\
- `dipole`: Fit an atomic dipole model. Global dipole labels or atomic dipole labels for all the selected atoms (see `sel_type`) should be provided by `dipole.npy` in each data system. The file has either (number of frames) lines with (3 times the number of selected atoms) columns, or (number of frames) lines with 3 columns. See the `loss` parameter.\n\n\
- `polar`: Fit an atomic polarizability model. Global polarizability labels or atomic polarizability labels for all the selected atoms (see `sel_type`) should be provided by `polarizability.npy` in each data system. The file has either (number of frames) lines with (9 times the number of selected atoms) columns, or (number of frames) lines with 9 columns. See the `loss` parameter.\n\n'
return Variant("type", [Argument("ener", dict, fitting_ener()),
Argument("dipole", dict, fitting_dipole()),
Argument("polar", dict, fitting_polar()),
],
optional = True,
default_tag = 'ener',
doc = doc_descrpt_type)
# --- Modifier configurations: --- #
def modifier_dipole_charge():
doc_model_name = "The name of the frozen dipole model file."
doc_model_charge_map = f"The charge of the WFCC. The list length should be the same as the {make_link('sel_type', 'model/fitting_net[dipole]/sel_type')}. "
doc_sys_charge_map = f"The charge of real atoms. The list length should be the same as the {make_link('type_map', 'model/type_map')}"
doc_ewald_h = f"The grid spacing of the FFT grid. Unit is A"
doc_ewald_beta = f"The splitting parameter of Ewald sum. Unit is A^{-1}"
return [
Argument("model_name", str, optional = False, doc = doc_model_name),
Argument("model_charge_map", list, optional = False, doc = doc_model_charge_map),
Argument("sys_charge_map", list, optional = False, doc = doc_sys_charge_map),
Argument("ewald_beta", float, optional = True, default = 0.4, doc = doc_ewald_beta),
Argument("ewald_h", float, optional = True, default = 1.0, doc = doc_ewald_h),
]
def modifier_variant_type_args():
doc_modifier_type = "The type of modifier. See explanation below.\n\n\
- `dipole_charge`: Use WFCC to model the electronic structure of the system. Correct the long-range interaction."
return Variant("type",
[
Argument("dipole_charge", dict, modifier_dipole_charge()),
],
optional = False,
doc = doc_modifier_type)
# --- model compression configurations: --- #
def model_compression():
doc_model_file = f"The input model file, which will be compressed by the DeePMD-kit."
doc_table_config = f"The arguments of model compression, including extrapolate(scale of model extrapolation), stride(uniform stride of tabulation's first and second table), and frequency(frequency of tabulation overflow check)."
doc_min_nbor_dist = f"The nearest distance between neighbor atoms saved in the frozen model."
return [
Argument("model_file", str, optional = False, doc = doc_model_file),
Argument("table_config", list, optional = False, doc = doc_table_config),
Argument("min_nbor_dist", float, optional = False, doc = doc_min_nbor_dist),
]
# --- model compression configurations: --- #
def model_compression_type_args():
doc_compress_type = "The type of model compression, which should be consistent with the descriptor type."
return Variant("type", [
Argument("se_e2_a", dict, model_compression(), alias = ['se_a'])
],
optional = True,
default_tag = 'se_e2_a',
doc = doc_compress_type)
def model_args ():
doc_type_map = 'A list of strings. Give the name to each type of atoms. Note that the number of atom types in the training system must be less than 128 in a GPU environment.'
doc_data_stat_nbatch = 'The model determines the normalization from the statistics of the data. This key specifies the number of `frames` in each `system` used for statistics.'
doc_data_stat_protect = 'Protect parameter for atomic energy regression.'
doc_type_embedding = "The type embedding."
doc_descrpt = 'The descriptor of atomic environment.'
doc_fitting = 'The fitting of physical properties.'
doc_modifier = 'The modifier of model output.'
doc_use_srtab = 'The table for the short-range pairwise interaction added on top of DP. The table is a text data file with (N_t + 1) * N_t / 2 + 1 columns. The first column is the distance between atoms. The columns from the second to the last are energies for pairs of certain types. For example, when we have two atom types, 0 and 1, the columns from the 2nd to the 4th correspond to the 0-0, 0-1 and 1-1 pairs, respectively.'
doc_smin_alpha = 'The short-range tabulated interaction will be switched according to the distance of the nearest neighbor. This distance is calculated by softmin. This parameter is the decaying parameter in the softmin. It is only required when `use_srtab` is provided.'
doc_sw_rmin = 'The lower boundary of the interpolation between short-range tabulated interaction and DP. It is only required when `use_srtab` is provided.'
doc_sw_rmax = 'The upper boundary of the interpolation between short-range tabulated interaction and DP. It is only required when `use_srtab` is provided.'
doc_compress_config = 'Model compression configurations'
ca = Argument("model", dict,
[Argument("type_map", list, optional = True, doc = doc_type_map),
Argument("data_stat_nbatch", int, optional = True, default = 10, doc = doc_data_stat_nbatch),
Argument("data_stat_protect", float, optional = True, default = 1e-2, doc = doc_data_stat_protect),
Argument("use_srtab", str, optional = True, doc = doc_use_srtab),
Argument("smin_alpha", float, optional = True, doc = doc_smin_alpha),
Argument("sw_rmin", float, optional = True, doc = doc_sw_rmin),
Argument("sw_rmax", float, optional = True, doc = doc_sw_rmax),
Argument("type_embedding", dict, type_embedding_args(), [], optional = True, doc = doc_type_embedding),
Argument("descriptor", dict, [], [descrpt_variant_type_args()], doc = doc_descrpt),
Argument("fitting_net", dict, [], [fitting_variant_type_args()], doc = doc_fitting),
Argument("modifier", dict, [], [modifier_variant_type_args()], optional = True, doc = doc_modifier),
Argument("compress", dict, [], [model_compression_type_args()], optional = True, doc = doc_compress_config)
])
# print(ca.gen_doc())
return ca
# --- Learning rate configurations: --- #
def learning_rate_exp():
doc_start_lr = 'The learning rate at the start of the training.'
doc_stop_lr = 'The desired learning rate at the end of the training.'
doc_decay_steps = 'The learning rate decays every this number of training steps.'
args = [
Argument("start_lr", float, optional = True, default = 1e-3, doc = doc_start_lr),
Argument("stop_lr", float, optional = True, default = 1e-8, doc = doc_stop_lr),
Argument("decay_steps", int, optional = True, default = 5000, doc = doc_decay_steps)
]
return args
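# Illustrative sketch (not part of the original file): with the arguments above, the
# exponential schedule roughly behaves as below, where `stop_steps` is the total number of
# training steps; the helper name and the exact rounding are assumptions for clarity only.
#
#     def _sketch_exp_lr(step, start_lr=1e-3, stop_lr=1e-8, decay_steps=5000, stop_steps=1000000):
#         decay_rate = (stop_lr / start_lr) ** (decay_steps / stop_steps)
#         return start_lr * decay_rate ** (step // decay_steps)
#
# i.e. the learning rate decays from `start_lr` toward `stop_lr` once every `decay_steps` steps.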
def learning_rate_variant_type_args():
doc_lr = 'The type of the learning rate.'
return Variant("type",
[Argument("exp", dict, learning_rate_exp())],
optional = True,
default_tag = 'exp',
doc = doc_lr)
def learning_rate_args():
doc_scale_by_worker = 'How to alter the learning rate during parallel training or when the batch size is scaled. Valid values are `linear` (default), `sqrt` or `none`.'
doc_lr = "The definition of learning rate"
return Argument("learning_rate", dict,
[Argument("scale_by_worker", str, optional=True, default='linear', doc=doc_scale_by_worker)],
[learning_rate_variant_type_args()],
doc = doc_lr)
# --- Loss configurations: --- #
def start_pref(item):
return f'The prefactor of {item} loss at the start of the training. Should be larger than or equal to 0. If set to a non-zero value, the {item} label should be provided by file {item}.npy in each data system. If both start_pref_{item} and limit_pref_{item} are set to 0, then the {item} will be ignored.'
def limit_pref(item):
return f'The prefactor of {item} loss at the limit of the training, i.e. as the training step goes to infinity. Should be larger than or equal to 0.'
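# Illustrative sketch (not part of the original file): the start/limit prefactors above are
# typically interpolated with the instantaneous learning rate, roughly as
#
#     pref(step) = limit_pref + (start_pref - limit_pref) * lr(step) / start_lr
#
# so a loss term is weighted by `start_pref` at the beginning of training and approaches
# `limit_pref` as the learning rate decays.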
def loss_ener():
doc_start_pref_e = start_pref('energy')
doc_limit_pref_e = limit_pref('energy')
doc_start_pref_f = start_pref('force')
doc_limit_pref_f = limit_pref('force')
doc_start_pref_v = start_pref('virial')
doc_limit_pref_v = limit_pref('virial')
doc_start_pref_ae = start_pref('atom_ener')
doc_limit_pref_ae = limit_pref('atom_ener')
doc_start_pref_pf = start_pref('atom_pref')
doc_limit_pref_pf = limit_pref('atom_pref')
doc_relative_f = 'If provided, relative force error will be used in the loss. The difference of force will be normalized by the magnitude of the force in the label with a shift given by `relative_f`, i.e. DF_i / ( || F || + relative_f ) with DF denoting the difference between prediction and label and || F || denoting the L2 norm of the label.'
doc_enable_atom_ener_coeff = "If true, the energy will be computed as \sum_i c_i E_i. c_i should be provided by file atom_ener_coeff.npy in each data system, otherwise it's 1."
return [
Argument("start_pref_e", [float,int], optional = True, default = 0.02, doc = doc_start_pref_e),
Argument("limit_pref_e", [float,int], optional = True, default = 1.00, doc = doc_limit_pref_e),
Argument("start_pref_f", [float,int], optional = True, default = 1000, doc = doc_start_pref_f),
Argument("limit_pref_f", [float,int], optional = True, default = 1.00, doc = doc_limit_pref_f),
Argument("start_pref_v", [float,int], optional = True, default = 0.00, doc = doc_start_pref_v),
Argument("limit_pref_v", [float,int], optional = True, default = 0.00, doc = doc_limit_pref_v),
Argument("start_pref_ae", [float,int], optional = True, default = 0.00, doc = doc_start_pref_ae),
Argument("limit_pref_ae", [float,int], optional = True, default = 0.00, doc = doc_limit_pref_ae),
Argument("start_pref_pf", [float,int], optional = True, default = 0.00, doc = doc_start_pref_pf),
Argument("limit_pref_pf", [float,int], optional = True, default = 0.00, doc = doc_limit_pref_pf),
Argument("relative_f", [float,None], optional = True, doc = doc_relative_f),
Argument("enable_atom_ener_coeff", [bool], optional=True, default=False, doc=doc_enable_atom_ener_coeff),
]
# YWolfeee: Modified to support tensor type of loss args.
def loss_tensor():
#doc_global_weight = "The prefactor of the weight of global loss. It should be larger than or equal to 0. If only `pref` is provided or both are not provided, training will be global mode, i.e. the shape of 'polarizability.npy` or `dipole.npy` should be #frams x [9 or 3]."
#doc_local_weight = "The prefactor of the weight of atomic loss. It should be larger than or equal to 0. If only `pref_atomic` is provided, training will be atomic mode, i.e. the shape of `polarizability.npy` or `dipole.npy` should be #frames x ([9 or 3] x #selected atoms). If both `pref` and `pref_atomic` are provided, training will be combined mode, and atomic label should be provided as well."
doc_global_weight = "The prefactor of the weight of the global loss. It should be larger than or equal to 0. It controls the weight of the loss corresponding to the global label, i.e. `polarizability.npy` or `dipole.npy`, whose shape should be #frames x [9 or 3]. If it is larger than 0.0, this npy file should be included."
doc_local_weight = "The prefactor of the weight of the atomic loss. It should be larger than or equal to 0. It controls the weight of the loss corresponding to the atomic label, i.e. `atomic_polarizability.npy` or `atomic_dipole.npy`, whose shape should be #frames x ([9 or 3] x #selected atoms). If it is larger than 0.0, this npy file should be included. Both `pref` and `pref_atomic` should be provided, and either can be set to 0.0."
return [
Argument("pref", [float,int], optional = False, default = None, doc = doc_global_weight),
Argument("pref_atomic", [float,int], optional = False, default = None, doc = doc_local_weight),
]
def loss_variant_type_args():
doc_loss = 'The type of the loss. When the fitting type is `ener`, the loss type should be set to `ener` or left unset. When the fitting type is `dipole` or `polar`, the loss type should be set to `tensor`.'
return Variant("type",
[Argument("ener", dict, loss_ener()),
Argument("tensor", dict, loss_tensor()),
#Argument("polar", dict, loss_tensor()),
#Argument("global_polar", dict, loss_tensor("global"))
],
optional = True,
default_tag = 'ener',
doc = doc_loss)
def loss_args():
doc_loss = 'The definition of the loss function. The loss type should be set to `tensor`, `ener` or left unset.'
ca = Argument('loss', dict, [],
[loss_variant_type_args()],
optional = True,
doc = doc_loss)
return ca
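# Illustrative sketch (not part of the original file): a possible "loss" section for an
# energy fitting and one for a tensor (dipole/polar) fitting; the numbers are hypothetical.
#
#     "loss": {"type": "ener", "start_pref_e": 0.02, "limit_pref_e": 1.0,
#              "start_pref_f": 1000, "limit_pref_f": 1.0}
#
#     "loss": {"type": "tensor", "pref": 1.0, "pref_atomic": 1.0}
#
# For the tensor loss, setting either prefactor to 0.0 drops the corresponding global or
# atomic label from the loss.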
# --- Training configurations: --- #
def training_data_args(): # ! added by Ziyao: new specification style for data systems.
link_sys = make_link("systems", "training/training_data/systems")
doc_systems = 'The data systems for training. ' \
'This key can be provided with a list that specifies the systems, or be provided with a string ' \
'by which the prefix of all systems is given and the list of the systems is automatically generated.'
doc_set_prefix = f'The prefix of the sets in the {link_sys}.'
doc_batch_size = f'This key can be \n\n\
- list: the length of which is the same as the {link_sys}. The batch size of each system is given by the elements of the list.\n\n\
- int: all {link_sys} use the same batch size.\n\n\
- string "auto": automatically determines the batch size so that the batch_size times the number of atoms in the system is no less than 32.\n\n\
- string "auto:N": automatically determines the batch size so that the batch_size times the number of atoms in the system is no less than N.'
doc_auto_prob_style = 'Determine the probability of systems automatically. The method is assigned by this key and can be\n\n\
- "prob_uniform" : the probability all the systems are equal, namely 1.0/self.get_nsystems()\n\n\
- "prob_sys_size" : the probability of a system is proportional to the number of batches in the system\n\n\
- "prob_sys_size;stt_idx:end_idx:weight;stt_idx:end_idx:weight;..." : the list of systems is devided into blocks. A block is specified by `stt_idx:end_idx:weight`, where `stt_idx` is the starting index of the system, `end_idx` is then ending (not including) index of the system, the probabilities of the systems in this block sums up to `weight`, and the relatively probabilities within this block is proportional to the number of batches in the system.'
doc_sys_probs = "A list of float if specified. " \
"Should be of the same length as `systems`, " \
"specifying the probability of each system."
args = [
Argument("systems", [list, str], optional=False, default=".", doc=doc_systems),
Argument("set_prefix", str, optional=True, default='set', doc=doc_set_prefix),
Argument("batch_size", [list, int, str], optional=True, default='auto', doc=doc_batch_size),
Argument("auto_prob", str, optional=True, default="prob_sys_size",
doc=doc_auto_prob_style, alias=["auto_prob_style",]),
Argument("sys_probs", list, optional=True, default=None, doc=doc_sys_probs, alias=["sys_weights"]),
]
doc_training_data = "Configurations of training data."
return Argument("training_data", dict, optional=False,
sub_fields=args, sub_variants=[], doc=doc_training_data)
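# Illustrative sketch (not part of the original file): a possible "training_data" section
# using the keys above; the system paths are hypothetical placeholders.
#
#     "training_data": {
#         "systems":    ["../data/system_0", "../data/system_1"],
#         "set_prefix": "set",
#         "batch_size": "auto:32"
#     }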
def validation_data_args(): # ! added by Ziyao: new specification style for data systems.
link_sys = make_link("systems", "training/validation_data/systems")
doc_systems = 'The data systems for validation. ' \
'This key can be provided with a list that specifies the systems, or be provided with a string ' \
'by which the prefix of all systems is given and the list of the systems is automatically generated.'
doc_set_prefix = f'The prefix of the sets in the {link_sys}.'
doc_batch_size = f'This key can be \n\n\
- list: the length of which is the same as the {link_sys}. The batch size of each system is given by the elements of the list.\n\n\
- int: all {link_sys} use the same batch size.\n\n\
- string "auto": automatically determines the batch size so that the batch_size times the number of atoms in the system is no less than 32.\n\n\
- string "auto:N": automatically determines the batch size so that the batch_size times the number of atoms in the system is no less than N.'
doc_auto_prob_style = 'Determine the probability of systems automatically. The method is assigned by this key and can be\n\n\
- "prob_uniform" : the probability all the systems are equal, namely 1.0/self.get_nsystems()\n\n\
- "prob_sys_size" : the probability of a system is proportional to the number of batches in the system\n\n\
- "prob_sys_size;stt_idx:end_idx:weight;stt_idx:end_idx:weight;..." : the list of systems is devided into blocks. A block is specified by `stt_idx:end_idx:weight`, where `stt_idx` is the starting index of the system, `end_idx` is then ending (not including) index of the system, the probabilities of the systems in this block sums up to `weight`, and the relatively probabilities within this block is proportional to the number of batches in the system.'
doc_sys_probs = "A list of float if specified. " \
"Should be of the same length as `systems`, " \
"specifying the probability of each system."
doc_numb_btch = "An integer that specifies the number of systems to be sampled for each validation period."
args = [
Argument("systems", [list, str], optional=False, default=".", doc=doc_systems),
Argument("set_prefix", str, optional=True, default='set', doc=doc_set_prefix),
Argument("batch_size", [list, int, str], optional=True, default='auto', doc=doc_batch_size),
Argument("auto_prob", str, optional=True, default="prob_sys_size",
doc=doc_auto_prob_style, alias=["auto_prob_style", ]),
Argument("sys_probs", list, optional=True, default=None, doc=doc_sys_probs, alias=["sys_weights"]),
Argument("numb_btch", int, optional=True, default=1, doc=doc_numb_btch, alias=["numb_batch", ])
]
doc_validation_data = "Configurations of validation data. Similar to that of training data, " \
"except that a `numb_btch` argument may be configured"
return Argument("validation_data", dict, optional=True, default=None,
sub_fields=args, sub_variants=[], doc=doc_validation_data)
def mixed_precision_args(): # ! added by Denghui.
doc_output_prec = "The precision of mixed precision parameters, i.e. the precision of the " \
"trainable variables during the mixed precision training process. " \
"Only float32 is currently supported."
doc_compute_prec = "The precision of mixed precision computation, i.e. the compute precision " \
"during the mixed precision training process. " \
"Only float16 is currently supported."
args = [
Argument("output_prec", str, optional=True, default="float32", doc=doc_output_prec),
Argument("compute_prec", str, optional=False, default="float16", doc=doc_compute_prec),
]
doc_mixed_precision = "Configurations of mixed precision."
return Argument("mixed_precision", dict, optional=True,
sub_fields=args, sub_variants=[], doc=doc_mixed_precision)
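# Illustrative sketch (not part of the original file): enabling mixed-precision training
# with the only options currently documented above.
#
#     "mixed_precision": {
#         "output_prec":  "float32",
#         "compute_prec": "float16"
#     }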
def training_args(): # ! modified by Ziyao: data configuration isolated.
doc_numb_steps = 'The number of training steps. Each training step uses one batch of data.'
doc_seed = 'The random seed for getting frames from the training data set.'
doc_disp_file = 'The file for printing learning curve.'
doc_disp_freq = 'The frequency of printing learning curve.'
doc_save_freq = 'The frequency of saving check point.'
doc_save_ckpt = 'The file name of saving check point.'
doc_disp_training = 'Displaying verbose information during training.'
doc_time_training = 'Timing during training.'
doc_profiling = 'Profiling during training.'
doc_profiling_file = 'Output file for profiling.'
doc_enable_profiler = 'Enable TensorFlow Profiler (available in TensorFlow 2.3) to analyze performance. The log will be saved to `tensorboard_log_dir`.'
doc_tensorboard = 'Enable tensorboard'
doc_tensorboard_log_dir = 'The log directory of tensorboard outputs'
doc_tensorboard_freq = 'The frequency of writing tensorboard events.'
arg_training_data = training_data_args()
arg_validation_data = validation_data_args()
mixed_precision_data = mixed_precision_args()
args = [
arg_training_data,
arg_validation_data,
mixed_precision_data,
Argument("numb_steps", int, optional=False, doc=doc_numb_steps, alias=["stop_batch"]),
Argument("seed", [int,None], optional=True, doc=doc_seed),
Argument("disp_file", str, optional=True, default='lcurve.out', doc=doc_disp_file),
Argument("disp_freq", int, optional=True, default=1000, doc=doc_disp_freq),
Argument("save_freq", int, optional=True, default=1000, doc=doc_save_freq),
Argument("save_ckpt", str, optional=True, default='model.ckpt', doc=doc_save_ckpt),
Argument("disp_training", bool, optional=True, default=True, doc=doc_disp_training),
Argument("time_training", bool, optional=True, default=True, doc=doc_time_training),
Argument("profiling", bool, optional=True, default=False, doc=doc_profiling),
Argument("profiling_file", str, optional=True, default='timeline.json', doc=doc_profiling_file),
Argument("enable_profiler", bool, optional=True, default=False, doc=doc_enable_profiler),
Argument("tensorboard", bool, optional=True, default=False, doc=doc_tensorboard),
Argument("tensorboard_log_dir", str, optional=True, default='log', doc=doc_tensorboard_log_dir),
Argument("tensorboard_freq", int, optional=True, default=1, doc=doc_tensorboard_freq),
]
doc_training = 'The training options.'
return Argument("training", dict, args, [], doc = doc_training)
def make_index(keys):
ret = []
for ii in keys:
ret.append(make_link(ii, ii))
return ', '.join(ret)
def gen_doc(*, make_anchor=True, make_link=True, **kwargs):
if make_link:
make_anchor = True
ma = model_args()
lra = learning_rate_args()
la = loss_args()
ta = training_args()
nvnmda = nvnmd_args()
ptr = []
ptr.append(ma.gen_doc(make_anchor=make_anchor, make_link=make_link, **kwargs))
ptr.append(la.gen_doc(make_anchor=make_anchor, make_link=make_link, **kwargs))
ptr.append(lra.gen_doc(make_anchor=make_anchor, make_link=make_link, **kwargs))
ptr.append(ta.gen_doc(make_anchor=make_anchor, make_link=make_link, **kwargs))
ptr.append(nvnmda.gen_doc(make_anchor=make_anchor, make_link=make_link, **kwargs))
key_words = []
for ii in "\n\n".join(ptr).split('\n'):
if 'argument path' in ii:
key_words.append(ii.split(':')[1].replace('`','').strip())
#ptr.insert(0, make_index(key_words))
return "\n\n".join(ptr)
def gen_json(**kwargs):
return json.dumps((
model_args(),
learning_rate_args(),
loss_args(),
training_args(),
nvnmd_args(),
), cls=ArgumentEncoder)
def normalize_hybrid_list(hy_list):
new_list = []
base = Argument("base", dict, [], [descrpt_variant_type_args()], doc = "")
for ii in range(len(hy_list)):
data = base.normalize_value(hy_list[ii], trim_pattern="_*")
base.check_value(data, strict=True)
new_list.append(data)
return new_list
def normalize(data):
if "hybrid" == data["model"]["descriptor"]["type"]:
data["model"]["descriptor"]["list"] \
= normalize_hybrid_list(data["model"]["descriptor"]["list"])
ma = model_args()
lra = learning_rate_args()
la = loss_args()
ta = training_args()
nvnmda = nvnmd_args()
base = Argument("base", dict, [ma, lra, la, ta, nvnmda])
data = base.normalize_value(data, trim_pattern="_*")
base.check_value(data, strict=True)
return data
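# Illustrative sketch (not part of the original file): how `normalize` is typically applied
# to a user-supplied input file before training; the file name is a hypothetical placeholder.
#
#     import json
#     with open("input.json") as fp:
#         jdata = json.load(fp)
#     jdata = normalize(jdata)   # fills in defaults and raises on unknown or ill-typed keys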
if __name__ == '__main__':
gen_doc()
import logging
from typing import Callable, Tuple
import numpy as np
from deepmd.utils.errors import OutOfMemoryError
class AutoBatchSize:
"""This class allows DeePMD-kit to automatically decide the maximum
batch size that will not cause an OOM error.
Notes
-----
We assume all OOM errors will raise :class:`OutOfMemoryError`.
Parameters
----------
initial_batch_size : int, default: 1024
initial batch size (number of total atoms)
factor : float, default: 2.
the factor by which the batch size is increased
Attributes
----------
current_batch_size : int
current batch size (number of total atoms)
maximum_working_batch_size : int
maximum working batch size
minimal_not_working_batch_size : int
minimal not working batch size
"""
def __init__(self, initial_batch_size: int = 1024, factor: float = 2.) -> None:
# See also PyTorchLightning/pytorch-lightning#1638
# TODO: discuss a proper initial batch size
self.current_batch_size = initial_batch_size
self.maximum_working_batch_size = 0
self.minimal_not_working_batch_size = 2**31
self.factor = factor
def execute(self, callable: Callable, start_index: int, natoms: int) -> Tuple[int, tuple]:
"""Excuate a method with given batch size.
Parameters
----------
callable : Callable
The method should accept the batch size and start_index as parameters,
and return the executed batch size and the data.
start_index : int
start index
natoms : int
natoms
Returns
-------
int
the executed batch size returned by the callable, or 0 if the execution failed with OOM
tuple
result from callable, None if failing to execute
Raises
------
OutOfMemoryError
OOM when batch size is 1
"""
try:
n_batch, result = callable(max(self.current_batch_size // natoms, 1), start_index)
except OutOfMemoryError as e:
# TODO: it's very slow to catch OOM error; I don't know what TF is doing here
# but luckily we only need to catch once
self.minimal_not_working_batch_size = min(self.minimal_not_working_batch_size, self.current_batch_size)
if self.maximum_working_batch_size >= self.minimal_not_working_batch_size:
self.maximum_working_batch_size = int(self.minimal_not_working_batch_size / self.factor)
if self.minimal_not_working_batch_size <= natoms:
raise OutOfMemoryError("The callable still throws an out-of-memory (OOM) error even when batch size is 1!") from e
# adjust the next batch size
self._adjust_batch_size(1./self.factor)
return 0, None
else:
n_tot = n_batch * natoms
self.maximum_working_batch_size = max(self.maximum_working_batch_size, n_tot)
# adjust the next batch size
if n_tot + natoms > self.current_batch_size and self.current_batch_size * self.factor < self.minimal_not_working_batch_size:
self._adjust_batch_size(self.factor)
return n_batch, result
def _adjust_batch_size(self, factor: float):
old_batch_size = self.current_batch_size
self.current_batch_size = int(self.current_batch_size * factor)
logging.info("Adjust batch size from %d to %d" % (old_batch_size, self.current_batch_size))
def execute_all(self, callable: Callable, total_size: int, natoms: int, *args, **kwargs) -> Tuple[np.ndarray]:
"""Excuate a method with all given data.
Parameters
----------
callable : Callable
The method should accept *args and **kwargs as input and return a similar array.
total_size : int
Total size
natoms : int
The number of atoms
*args, **kwargs
Any argument that is an np.ndarray with more than one dimension is assumed to have the batch on its first axis and is sliced per batch; other arguments are passed through unchanged.
"""
def execute_with_batch_size(batch_size: int, start_index: int) -> Tuple[int, Tuple[np.ndarray]]:
end_index = start_index + batch_size
end_index = min(end_index, total_size)
return (end_index - start_index), callable(
*[(vv[start_index:end_index] if isinstance(vv, np.ndarray) and vv.ndim > 1 else vv) for vv in args],
**{kk: (vv[start_index:end_index] if isinstance(vv, np.ndarray) and vv.ndim > 1 else vv) for kk, vv in kwargs.items()},
)
index = 0
results = []
while index < total_size:
n_batch, result = self.execute(execute_with_batch_size, index, natoms)
if not isinstance(result, tuple):
result = (result,)
index += n_batch
if n_batch:
# np.ndarray.reshape returns a new array, so collect the reshaped results explicitly
result = tuple(rr.reshape((n_batch, -1)) for rr in result)
results.append(result)
r = tuple([np.concatenate(r, axis=0) for r in zip(*results)])
if len(r) == 1:
# avoid returning tuple if callable doesn't return tuple
r = r[0]
return r
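# Illustrative sketch (not part of the original file): typical use of AutoBatchSize, where
# `eval_func` is a hypothetical callable that evaluates one batch of the (sliced) input
# arrays and raises OutOfMemoryError when the batch does not fit in memory.
#
#     auto_batch = AutoBatchSize(initial_batch_size=1024)
#     energies = auto_batch.execute_all(eval_func, nframes, natoms, coords)
#
# The working batch size is grown by `factor` while evaluations succeed and reduced after an
# OOM, so the largest batch size that fits in memory is found automatically.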