Commit 6b33aeb8 authored by zhangqha's avatar zhangqha
Browse files

BladeDISC DeePMD code

parents
Pipeline #179 canceled with stages
#!/usr/bin/env python3
from tensorflow.python.framework import ops
from deepmd.env import op_module
from deepmd.env import tf
@ops.RegisterGradient("MapNvnmd")
def _MapNvnmdGrad(op, grad):
    """Gradient for the MapNvnmd op: backprop flows only into the x input.

    The derivative dy/dx is evaluated with the mapping op itself, feeding the
    derivative tables (grad_v, grad_dv) where the value tables normally go.
    """
    x, v, dv, grad_v, grad_dv = (op.inputs[i] for i in range(5))
    prec = op.get_attr("prec")
    nbit = op.get_attr("nbit")
    dydx = op_module.map_nvnmd(
        x, grad_v, grad_dv, tf.zeros_like(v), tf.zeros_like(dv), prec, nbit
    )
    dydx = op_module.quantize_nvnmd(dydx, 0, nbit, -1, -1)
    dx = tf.reshape(tf.reduce_sum(dydx * grad, axis=1), [-1, 1])
    # the table inputs are constants; they receive no gradient
    return [dx, None, None, None, None]
#!/usr/bin/env python3
from tensorflow.python.framework import ops
from deepmd.env import op_module
from deepmd.env import tf
@ops.RegisterGradient("MatmulNvnmd")
def _MatmulNvnmdGrad(op, grad):
    """Gradient for the quantized matmul op.

    Standard matmul backprop (dL/dx = g . w^T, dL/dw = x^T . g), evaluated
    with the same quantized matmul kernel as the forward pass.
    """
    x, w = op.inputs[0], op.inputs[1]
    # nbit order is rotated relative to the forward attrs, as in the original
    quant_args = (
        op.get_attr("isround"),
        op.get_attr("nbit2"),
        op.get_attr("nbit3"),
        op.get_attr("nbit1"),
    )
    dx = op_module.matmul_nvnmd(grad, tf.transpose(w), *quant_args)
    dw = op_module.matmul_nvnmd(tf.transpose(x), grad, *quant_args)
    return [dx, dw]
#!/usr/bin/env python3
"""
Gradients for prod force.
"""
from tensorflow.python.framework import ops
from deepmd.env import op_grads_module
@ops.RegisterGradient("ProdForce")
def _prod_force_grad_cc(op, grad):
    """Gradient of ProdForce w.r.t. its first input (the network derivative)."""
    net_grad = op_grads_module.prod_force_grad(
        grad,
        *[op.inputs[i] for i in range(5)],
        n_a_sel=op.get_attr("n_a_sel"),
        n_r_sel=op.get_attr("n_r_sel"),
    )
    # only the first input receives a gradient; the rest are constants
    return [net_grad] + [None] * 4
#!/usr/bin/env python3
"""
Gradients for prod force.
"""
from tensorflow.python.framework import ops
from deepmd.env import op_grads_module
@ops.RegisterGradient("ProdForceSeA")
def _prod_force_se_a_grad_cc(op, grad):
    """Gradient of ProdForceSeA w.r.t. its first input (the network derivative)."""
    net_grad = op_grads_module.prod_force_se_a_grad(
        grad,
        *[op.inputs[i] for i in range(4)],
        n_a_sel=op.get_attr("n_a_sel"),
        n_r_sel=op.get_attr("n_r_sel"),
    )
    # only the first input receives a gradient; the rest are constants
    return [net_grad] + [None] * 3
#!/usr/bin/env python3
"""
Gradients for prod force.
"""
from tensorflow.python.framework import ops
from deepmd.env import op_grads_module
@ops.RegisterGradient("ProdForceSeR")
def _prod_force_se_r_grad_cc(op, grad):
    """Gradient of ProdForceSeR w.r.t. its first input (the network derivative).

    Fix: the handler was named ``_prod_force_se_a_grad_cc`` — a copy-paste
    leftover from the se_a variant — even though it implements the se_r
    gradient; renamed to match the op it registers for. Registration and
    behavior are unchanged.
    """
    net_grad = op_grads_module.prod_force_se_r_grad(
        grad,
        op.inputs[0],
        op.inputs[1],
        op.inputs[2],
        op.inputs[3],
    )
    # only the first input receives a gradient; the rest are constants
    return [net_grad, None, None, None]
#!/usr/bin/env python3
"""
Gradients for prod virial.
"""
from tensorflow.python.framework import ops
from deepmd.env import op_grads_module
@ops.RegisterGradient("ProdVirial")
def _prod_virial_grad_cc(op, grad, grad_atom):
    """Gradient of ProdVirial w.r.t. its first input (the network derivative).

    ``grad_atom`` (incoming gradient for the per-atom virial output) is
    intentionally unused, matching the original implementation.
    """
    net_grad = op_grads_module.prod_virial_grad(
        grad,
        *[op.inputs[i] for i in range(6)],
        n_a_sel=op.get_attr("n_a_sel"),
        n_r_sel=op.get_attr("n_r_sel"),
    )
    # only the first input receives a gradient; the rest are constants
    return [net_grad] + [None] * 5
#!/usr/bin/env python3
"""
Gradients for prod virial.
"""
from tensorflow.python.framework import ops
from deepmd.env import op_grads_module
@ops.RegisterGradient("ProdVirialSeA")
def _prod_virial_se_a_grad_cc(op, grad, grad_atom):
    """Gradient of ProdVirialSeA w.r.t. its first input (the network derivative).

    ``grad_atom`` (incoming gradient for the per-atom virial output) is
    intentionally unused, matching the original implementation.
    """
    net_grad = op_grads_module.prod_virial_se_a_grad(
        grad,
        *[op.inputs[i] for i in range(5)],
        n_a_sel=op.get_attr("n_a_sel"),
        n_r_sel=op.get_attr("n_r_sel"),
    )
    # only the first input receives a gradient; the rest are constants
    return [net_grad] + [None] * 4
#!/usr/bin/env python3
"""
Gradients for prod virial.
"""
from tensorflow.python.framework import ops
from deepmd.env import op_grads_module
@ops.RegisterGradient("ProdVirialSeR")
def _prod_virial_se_r_grad_cc(op, grad, grad_atom):
    """Gradient of ProdVirialSeR w.r.t. its first input (the network derivative).

    Fix: the handler was named ``_prod_virial_se_a_grad_cc`` — a copy-paste
    leftover from the se_a variant — even though it implements the se_r
    gradient; renamed to match the op it registers for. Registration and
    behavior are unchanged. ``grad_atom`` is intentionally unused, as before.
    """
    net_grad = op_grads_module.prod_virial_se_r_grad(
        grad,
        op.inputs[0],
        op.inputs[1],
        op.inputs[2],
        op.inputs[3],
        op.inputs[4],
    )
    # only the first input receives a gradient; the rest are constants
    return [net_grad, None, None, None, None]
#!/usr/bin/env python3
from tensorflow.python.framework import ops
from deepmd.env import op_module
from deepmd.env import tf
@ops.RegisterGradient("QuantizeNvnmd")
def _QuantizeNvnmdGrad(op, grad):
    """Straight-through-style gradient: re-quantize the incoming gradient
    with the same op (nbit order rotated as in the forward attrs)."""
    isround, nbit1, nbit2, nbit3 = (
        op.get_attr(k) for k in ("isround", "nbit1", "nbit2", "nbit3")
    )
    return op_module.quantize_nvnmd(grad, isround, nbit2, nbit3, nbit1)
#!/usr/bin/env python3
"""
Gradients for soft min force
"""
from tensorflow.python.framework import ops
from deepmd.env import op_grads_module
@ops.RegisterGradient("SoftMinForce")
def _soft_min_force_grad_cc(op, grad):
    """Gradient of SoftMinForce w.r.t. its first input."""
    net_grad = op_grads_module.soft_min_force_grad(
        grad,
        *[op.inputs[i] for i in range(4)],
        n_a_sel=op.get_attr("n_a_sel"),
        n_r_sel=op.get_attr("n_r_sel"),
    )
    # only the first input receives a gradient; the rest are constants
    return [net_grad] + [None] * 3
#!/usr/bin/env python3
"""
Gradients for soft min virial.
"""
from tensorflow.python.framework import ops
from deepmd.env import op_grads_module
@ops.RegisterGradient("SoftMinVirial")
def _soft_min_virial_grad_cc(op, grad, grad_atom):
    """Gradient of SoftMinVirial w.r.t. its first input.

    ``grad_atom`` (incoming gradient for the per-atom output) is intentionally
    unused, matching the original implementation.
    """
    net_grad = op_grads_module.soft_min_virial_grad(
        grad,
        *[op.inputs[i] for i in range(5)],
        n_a_sel=op.get_attr("n_a_sel"),
        n_r_sel=op.get_attr("n_r_sel"),
    )
    # only the first input receives a gradient; the rest are constants
    return [net_grad] + [None] * 4
#!/usr/bin/env python3
"""
Gradients for tabulate.
"""
from tensorflow.python.framework import ops
from deepmd.env import op_module
from deepmd.env import tf
# from deepmd.DescrptSeATabulate import last_layer_size
@ops.RegisterGradient("TabulateFusion")
@ops.RegisterGradient("TabulateFusionSeA")
def _tabulate_fusion_se_a_grad_cc(op, dy):
    """First-order gradient of TabulateFusion(SeA): gradients flow into
    inputs 2 and 3 only; the first two (table) inputs are constants."""
    dy_dx, dy_df = op_module.tabulate_fusion_se_a_grad(
        *[op.inputs[i] for i in range(4)], dy, op.outputs[0]
    )
    return [None, None, dy_dx, dy_df]
@ops.RegisterGradient("TabulateFusionGrad")
@ops.RegisterGradient("TabulateFusionSeAGrad")
def _tabulate_fusion_se_a_grad_grad_cc(op, dy, dy_):
    """Second-order gradient of TabulateFusion(SeA): only input 4 of the
    first-order gradient op receives a gradient."""
    dz_dy = op_module.tabulate_fusion_se_a_grad_grad(
        *[op.inputs[i] for i in range(4)], dy, dy_, op.inputs[5]
    )
    return [None, None, None, None, dz_dy, None]
@ops.RegisterGradient("TabulateFusionSeT")
def _tabulate_fusion_se_t_grad_cc(op, dy):
    """First-order gradient of TabulateFusionSeT: gradients flow into
    inputs 2 and 3 only; the first two (table) inputs are constants."""
    dy_dx, dy_df = op_module.tabulate_fusion_se_t_grad(
        *[op.inputs[i] for i in range(4)], dy, op.outputs[0]
    )
    return [None, None, dy_dx, dy_df]
@ops.RegisterGradient("TabulateFusionSeTGrad")
def _tabulate_fusion_se_t_grad_grad_cc(op, dy, dy_):
    """Second-order gradient of TabulateFusionSeT: only input 4 of the
    first-order gradient op receives a gradient."""
    dz_dy = op_module.tabulate_fusion_se_t_grad_grad(
        *[op.inputs[i] for i in range(4)], dy, dy_, op.inputs[5]
    )
    return [None, None, None, None, dz_dy, None]
@ops.RegisterGradient("TabulateFusionSeR")
def _tabulate_fusion_se_r_grad_cc(op, dy):
    """First-order gradient of TabulateFusionSeR: gradient flows into
    input 2 only; the first two (table) inputs are constants."""
    dy_df = op_module.tabulate_fusion_se_r_grad(
        *[op.inputs[i] for i in range(3)], dy, op.outputs[0]
    )
    return [None, None, dy_df]
@ops.RegisterGradient("TabulateFusionSeRGrad")
def _tabulate_fusion_se_r_grad_grad_cc(op, dy):
    """Second-order gradient of TabulateFusionSeR: only input 3 of the
    first-order gradient op receives a gradient.

    NOTE(review): unlike the se_a/se_t variants this handler takes a single
    incoming gradient — presumably the SeR grad op has one output; confirm
    against the C++ op registration.
    """
    dz_dy = op_module.tabulate_fusion_se_r_grad_grad(
        *[op.inputs[i] for i in range(3)], dy, op.inputs[4]
    )
    return [None, None, None, dz_dy, None]
\ No newline at end of file
#!/usr/bin/env python3
from tensorflow.python.framework import ops
from deepmd.env import op_module
from deepmd.env import tf
@ops.RegisterGradient("Tanh2Nvnmd")
def _Tanh2NvnmdGrad(op, grad):
    """Gradient of the piecewise tanh2 approximation, floor-quantized to
    nbit2 fractional bits when nbit2 > -1."""
    isround, nbit1, nbit2, nbit3 = (
        op.get_attr(k) for k in ("isround", "nbit1", "nbit2", "nbit3")
    )
    prec = 2 ** nbit2
    quantized = nbit2 > -1

    def _floor_quantize(t):
        # snap t to 1/prec steps; stop_gradient keeps backprop straight-through
        return t + tf.stop_gradient(tf.floor(t * prec) / prec - t)

    a = tf.abs(op.inputs[0])
    seg1 = tf.clip_by_value(a, 0, 2)
    seg2 = tf.clip_by_value(a, 0, 4)
    # derivative of the piecewise polynomial; 0.0078125 == 1/128
    dydx = (132 - 64 * seg1 - seg2) * 0.0078125
    if quantized:
        dydx = _floor_quantize(dydx)
    dx = dydx * grad
    if quantized:
        dx = _floor_quantize(dx)
    return dx
#!/usr/bin/env python3
from tensorflow.python.framework import ops
from deepmd.env import op_module
from deepmd.env import tf
@ops.RegisterGradient("Tanh4Nvnmd")
def _Tanh4NvnmdGrad(op, grad):
    """Gradient of the quartic tanh4 approximation, floor-quantized to
    nbit2 fractional bits when nbit2 > -1."""
    isround, nbit1, nbit2, nbit3 = (
        op.get_attr(k) for k in ("isround", "nbit1", "nbit2", "nbit3")
    )
    prec = 2 ** nbit2
    quantized = nbit2 > -1

    def _floor_quantize(t):
        # snap t to 1/prec steps; stop_gradient keeps backprop straight-through
        return t + tf.stop_gradient(tf.floor(t * prec) / prec - t)

    xa = tf.abs(tf.clip_by_value(op.inputs[0], -2, 2))
    xx = xa * xa
    if quantized:
        xx = _floor_quantize(xx)
    # d/dx of the quartic approximation on the clipped domain
    dydx = xx * (xa / 4 - 3 / 4) + 1
    if quantized:
        dydx = _floor_quantize(dydx)
    dx = dydx * grad
    if quantized:
        dx = _floor_quantize(dx)
    return dx
[CONFIG]
INSTALL_PREFIX = /root/deepmd-kit/_skbuild/linux-x86_64-3.6/cmake-install
GIT_SUMM = v2.1.5-dirty
GIT_HASH = 6e3d4a62
GIT_DATE = 2022-09-23 16:10:28 +0800
GIT_BRANCH = HEAD
TF_INCLUDE_DIR = /usr/local/lib/python3.6/dist-packages/tensorflow/include;/usr/local/lib/python3.6/dist-packages/tensorflow/include
TF_LIBS =
TF_VERSION = 2.4.0
TF_CXX11_ABI_FLAG = 0
MODEL_VERSION=1.1
DP_VARIANT=cpu
"""Module taking care of important package constants."""
import logging
import os
from pathlib import Path
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
import numpy as np
from deepmd.cluster import get_resource
from deepmd.env import get_tf_default_nthreads, tf, GLOBAL_CONFIG, global_float_prec
from deepmd.loggers import set_log_handles
if TYPE_CHECKING:
import horovod.tensorflow as HVD
__all__ = [
    "WELCOME",
    "CITATION",
    "BUILD",
    "RunOptions",
]

log = logging.getLogger(__name__)


# ASCII banner from http://patorjk.com/software/taag. Font: "Big"
WELCOME = (  # noqa
    " _____ _____ __ __ _____ _ _ _ ",
    "| __ \ | __ \ | \/ || __ \ | | (_)| | ",
    "| | | | ___ ___ | |__) || \ / || | | | ______ | | __ _ | |_ ",
    "| | | | / _ \ / _ \| ___/ | |\/| || | | ||______|| |/ /| || __|",
    "| |__| || __/| __/| | | | | || |__| | | < | || |_ ",
    "|_____/ \___| \___||_| |_| |_||_____/ |_|\_\|_| \__|",
)

CITATION = (
    "Please read and cite:",
    "Wang, Zhang, Han and E, Comput.Phys.Comm. 228, 178-184 (2018)",
)

# separator used to break the semicolon-joined tf lib list onto aligned lines
_sep = "\n "

# Build-time configuration summary printed at startup.
# Fix: "source brach" -> "source branch" (typo in the user-facing label).
BUILD = (
    f"installed to: {GLOBAL_CONFIG['install_prefix']}",
    f"source : {GLOBAL_CONFIG['git_summ']}",
    f"source branch: {GLOBAL_CONFIG['git_branch']}",
    f"source commit: {GLOBAL_CONFIG['git_hash']}",
    f"source commit at: {GLOBAL_CONFIG['git_date']}",
    f"build float prec: {global_float_prec}",
    f"build variant: {GLOBAL_CONFIG['dp_variant']}",
    f"build with tf inc: {GLOBAL_CONFIG['tf_include_dir']}",
    f"build with tf lib: {GLOBAL_CONFIG['tf_libs'].replace(';', _sep)}"  # noqa
)
class RunOptions:
    """Class with info on how to run training (cluster, MPI and GPU config).

    Attributes
    ----------
    gpus: Optional[List[int]]
        list of GPUs if any are present else None
    is_chief: bool
        in distributed training it is true for the main MPI process, in serial
        it is always true
    world_size: int
        total worker count
    my_rank: int
        index of the MPI task
    nodename: str
        name of the node
    nodelist : List[str]
        the list of nodes of the current mpirun
    my_device: str
        device type - gpu or cpu
    """

    gpus: Optional[List[int]]
    world_size: int
    my_rank: int
    nodename: str
    nodelist: List[int]
    my_device: str

    # horovod module handle when running distributed, else None
    _HVD: Optional["HVD"]
    # guards against installing log handlers twice
    _log_handles_already_set: bool = False

    def __init__(
        self,
        init_model: Optional[str] = None,
        init_frz_model: Optional[str] = None,
        restart: Optional[str] = None,
        log_path: Optional[str] = None,
        log_level: int = 0,
        mpi_log: str = "master"
    ):
        # must run first: decides serial vs distributed and sets rank/device
        self._try_init_distrib()

        # --init-model and --restart are mutually exclusive
        if all((init_model, restart)):
            raise RuntimeError(
                "--init-model and --restart should not be set at the same time"
            )

        # model init options; restart takes precedence over init_model,
        # which takes precedence over init_frz_model
        self.restart = restart
        self.init_model = init_model
        self.init_mode = "init_from_scratch"

        if restart is not None:
            self.restart = os.path.abspath(restart)
            self.init_mode = "restart"
        elif init_model is not None:
            self.init_model = os.path.abspath(init_model)
            self.init_mode = "init_from_model"
        elif init_frz_model is not None:
            self.init_frz_model = os.path.abspath(init_frz_model)
            self.init_mode = "init_from_frz_model"

        self._setup_logger(Path(log_path) if log_path else None, log_level, mpi_log)

    @property
    def is_chief(self):
        """Whether my rank is 0."""
        return self.my_rank == 0

    def print_resource_summary(self):
        """Print build and current running cluster configuration summary."""
        log.info("---Summary of the training---------------------------------------")
        if self.is_distrib:
            log.info("distributed")
            log.info(f"world size: {self.world_size}")
            log.info(f"my rank: {self.my_rank}")
            log.info(f"node list: {self.nodelist}")
        log.info(f"running on: {self.nodename}")
        log.info(f"computing device: {self.my_device}")
        env_value = os.environ.get('CUDA_VISIBLE_DEVICES', 'unset')
        log.info(f"CUDA_VISIBLE_DEVICES: {env_value}")
        log.info(f"Count of visible GPU: {len(self.gpus or [])}")
        intra, inter = get_tf_default_nthreads()
        log.info(f"num_intra_threads: {intra:d}")
        log.info(f"num_inter_threads: {inter:d}")
        log.info("-----------------------------------------------------------------")

    def _setup_logger(
        self,
        log_path: Optional[Path],
        log_level: int,
        mpi_log: Optional[str],
    ):
        """Set up package loggers.

        Parameters
        ----------
        log_level: int
            logging level
        log_path: Optional[str]
            path to log file, if None logs will be sent only to console. If the parent
            directory does not exist it will be automatically created, by default None
        mpi_log : Optional[str], optional
            mpi log type. Has three options. `master` will output logs to file and
            console only from rank==0. `collect` will write messages from all ranks to
            one file opened under rank==0 and to console. `workers` will open one log
            file for each worker designated by its rank, console behaviour is the same
            as for `collect`.
        """
        if not self._log_handles_already_set:
            # without horovod there is no MPI context, so mpi_log is meaningless
            if not self._HVD:
                mpi_log = None
            set_log_handles(log_level, log_path, mpi_log=mpi_log)
            self._log_handles_already_set = True
            log.debug("Log handles were successfully set")
        else:
            log.warning(
                f"Log handles have already been set. It is not advisable to "
                f"reset them{', especially when runnig with MPI!' if self._HVD else ''}"
            )

    def _try_init_distrib(self):
        # probe for horovod; fall back to serial execution if it is absent
        try:
            import horovod.tensorflow as HVD
            HVD.init()
            # a single horovod process is still treated as serial
            self.is_distrib = HVD.size() > 1
        except ImportError:
            log.warning("Switch to serial execution due to lack of horovod module.")
            self.is_distrib = False

        # Do real initialization
        if self.is_distrib:
            self._init_distributed(HVD)
            self._HVD = HVD
        else:
            self._init_serial()
            self._HVD = None

    def _init_distributed(self, HVD: "HVD"):
        """Initialize settings for distributed training.

        Parameters
        ----------
        HVD : HVD
            horovod object
        """
        nodename, nodelist, gpus = get_resource()
        self.nodename = nodename
        self.nodelist = nodelist
        self.gpus = gpus
        self.my_rank = HVD.rank()
        self.world_size = HVD.size()

        if gpus is not None:
            # one GPU per local horovod process
            gpu_idx = HVD.local_rank()
            if gpu_idx >= len(gpus):
                raise RuntimeError('Count of local processes is larger than that of available GPUs!')
            self.my_device = f"gpu:{gpu_idx:d}"
        else:
            self.my_device = "cpu:0"

    def _init_serial(self):
        """Initialize setting for serial training."""
        nodename, _, gpus = get_resource()

        self.gpus = gpus
        self.world_size = 1
        self.my_rank = 0
        self.nodename = nodename
        self.nodelist = [nodename]

        if gpus is not None:
            self.my_device = "gpu:0"
        else:
            self.my_device = "cpu:0"

        self._HVD = None
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment