ModelZoo / DeePMD_bladedisc / Commits

Commit 6b33aeb8
Authored Apr 17, 2023 by zhangqha

BladeDISC DeePMD code

Pipeline #179 canceled with stages
Changes: 324 | Pipelines: 1
Showing 20 changed files with 545 additions and 0 deletions (+545, -0)
_skbuild/linux-x86_64-3.6/setuptools/lib/deepmd/op/_map_nvnmd_grad.py  +26 -0
_skbuild/linux-x86_64-3.6/setuptools/lib/deepmd/op/_matmul_nvnmd_grad.py  +17 -0
_skbuild/linux-x86_64-3.6/setuptools/lib/deepmd/op/_prod_force_grad.py  +19 -0
_skbuild/linux-x86_64-3.6/setuptools/lib/deepmd/op/_prod_force_se_a_grad.py  +18 -0
_skbuild/linux-x86_64-3.6/setuptools/lib/deepmd/op/_prod_force_se_r_grad.py  +16 -0
_skbuild/linux-x86_64-3.6/setuptools/lib/deepmd/op/_prod_virial_grad.py  +20 -0
_skbuild/linux-x86_64-3.6/setuptools/lib/deepmd/op/_prod_virial_se_a_grad.py  +19 -0
_skbuild/linux-x86_64-3.6/setuptools/lib/deepmd/op/_prod_virial_se_r_grad.py  +17 -0
_skbuild/linux-x86_64-3.6/setuptools/lib/deepmd/op/_quantize_nvnmd_grad.py  +14 -0
_skbuild/linux-x86_64-3.6/setuptools/lib/deepmd/op/_soft_min_force_grad.py  +19 -0
_skbuild/linux-x86_64-3.6/setuptools/lib/deepmd/op/_soft_min_virial_grad.py  +20 -0
_skbuild/linux-x86_64-3.6/setuptools/lib/deepmd/op/_tabulate_grad.py  +42 -0
_skbuild/linux-x86_64-3.6/setuptools/lib/deepmd/op/_tanh2_nvnmd_grad.py  +24 -0
_skbuild/linux-x86_64-3.6/setuptools/lib/deepmd/op/_tanh4_nvnmd_grad.py  +28 -0
_skbuild/linux-x86_64-3.6/setuptools/lib/deepmd/op/libdeepmd.so  +0 -0
_skbuild/linux-x86_64-3.6/setuptools/lib/deepmd/op/libop_abi.so  +0 -0
_skbuild/linux-x86_64-3.6/setuptools/lib/deepmd/op/libop_grads.so  +0 -0
_skbuild/linux-x86_64-3.6/setuptools/lib/deepmd/pkg_config/run_config.ini  +12 -0
_skbuild/linux-x86_64-3.6/setuptools/lib/deepmd/train/__init__.py  +1 -0
_skbuild/linux-x86_64-3.6/setuptools/lib/deepmd/train/run_options.py  +233 -0
Too many changes to show. To preserve performance, only 324 of 324+ files are displayed.
_skbuild/linux-x86_64-3.6/setuptools/lib/deepmd/op/_map_nvnmd_grad.py (new file, mode 100644)

#!/usr/bin/env python3
from tensorflow.python.framework import ops
from deepmd.env import op_module
from deepmd.env import tf


@ops.RegisterGradient("MapNvnmd")
def _MapNvnmdGrad(op, grad):
    x = op.inputs[0]
    v = op.inputs[1]
    dv = op.inputs[2]
    grad_v = op.inputs[3]
    grad_dv = op.inputs[4]
    prec = op.get_attr("prec")
    nbit = op.get_attr("nbit")
    y = op.outputs[0]
    dydx = op_module.map_nvnmd(
        x, grad_v, grad_dv, tf.zeros_like(v), tf.zeros_like(dv), prec, nbit)
    dydx = op_module.quantize_nvnmd(dydx, 0, nbit, -1, -1)
    dx = tf.reshape(tf.reduce_sum(dydx * grad, axis=1), [-1, 1])
    d_v = None
    d_dv = None
    d_grad_v = None
    d_grad_dv = None
    return [dx, d_v, d_dv, d_grad_v, d_grad_dv]
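Every *_grad.py file in this commit follows the same pattern: a Python function decorated with ops.RegisterGradient is tied to the type name of a custom TensorFlow op, and tf.gradients dispatches to that function whenever an op of that type appears in the graph. The following minimal sketch is not part of the commit; it assumes TF 1.x-style graph mode (as DeePMD's op wrappers use) and a hypothetical registration name "DoubleIdentity" to show the mechanism on a built-in op:

import tensorflow.compat.v1 as tf
from tensorflow.python.framework import ops

tf.disable_eager_execution()


@ops.RegisterGradient("DoubleIdentity")  # hypothetical name, for illustration only
def _double_identity_grad(op, grad):
    # Same signature as _MapNvnmdGrad above: the forward op plus the incoming
    # gradient; return one gradient tensor (or None) per op input.
    return [2.0 * grad]


g = tf.Graph()
with g.as_default():
    x = tf.constant([1.0, 3.0])
    # Route the gradient of the built-in Identity op through the function above.
    with g.gradient_override_map({"Identity": "DoubleIdentity"}):
        y = tf.identity(x)
    (dx,) = tf.gradients(y, x)
    with tf.Session() as sess:
        print(sess.run(dx))  # [2. 2.]  -- the registered function was used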
_skbuild/linux-x86_64-3.6/setuptools/lib/deepmd/op/_matmul_nvnmd_grad.py (new file, mode 100644)

#!/usr/bin/env python3
from tensorflow.python.framework import ops
from deepmd.env import op_module
from deepmd.env import tf


@ops.RegisterGradient("MatmulNvnmd")
def _MatmulNvnmdGrad(op, grad):
    x = op.inputs[0]
    w = op.inputs[1]
    isround = op.get_attr("isround")
    nbit1 = op.get_attr("nbit1")
    nbit2 = op.get_attr("nbit2")
    nbit3 = op.get_attr("nbit3")
    dx = op_module.matmul_nvnmd(grad, tf.transpose(w), isround, nbit2, nbit3, nbit1)
    dw = op_module.matmul_nvnmd(tf.transpose(x), grad, isround, nbit2, nbit3, nbit1)
    return [dx, dw]
_skbuild/linux-x86_64-3.6/setuptools/lib/deepmd/op/_prod_force_grad.py (new file, mode 100644)

#!/usr/bin/env python3
"""
Gradients for prod force.
"""
from tensorflow.python.framework import ops
from deepmd.env import op_grads_module


@ops.RegisterGradient("ProdForce")
def _prod_force_grad_cc(op, grad):
    net_grad = op_grads_module.prod_force_grad(
        grad,
        op.inputs[0],
        op.inputs[1],
        op.inputs[2],
        op.inputs[3],
        op.inputs[4],
        n_a_sel=op.get_attr("n_a_sel"),
        n_r_sel=op.get_attr("n_r_sel"))
    return [net_grad, None, None, None, None]
_skbuild/linux-x86_64-3.6/setuptools/lib/deepmd/op/_prod_force_se_a_grad.py (new file, mode 100644)

#!/usr/bin/env python3
"""
Gradients for prod force.
"""
from tensorflow.python.framework import ops
from deepmd.env import op_grads_module


@ops.RegisterGradient("ProdForceSeA")
def _prod_force_se_a_grad_cc(op, grad):
    net_grad = op_grads_module.prod_force_se_a_grad(
        grad,
        op.inputs[0],
        op.inputs[1],
        op.inputs[2],
        op.inputs[3],
        n_a_sel=op.get_attr("n_a_sel"),
        n_r_sel=op.get_attr("n_r_sel"))
    return [net_grad, None, None, None]
_skbuild/linux-x86_64-3.6/setuptools/lib/deepmd/op/_prod_force_se_r_grad.py (new file, mode 100644)

#!/usr/bin/env python3
"""
Gradients for prod force.
"""
from tensorflow.python.framework import ops
from deepmd.env import op_grads_module


@ops.RegisterGradient("ProdForceSeR")
def _prod_force_se_a_grad_cc(op, grad):
    net_grad = op_grads_module.prod_force_se_r_grad(
        grad,
        op.inputs[0],
        op.inputs[1],
        op.inputs[2],
        op.inputs[3])
    return [net_grad, None, None, None]
_skbuild/linux-x86_64-3.6/setuptools/lib/deepmd/op/_prod_virial_grad.py (new file, mode 100644)

#!/usr/bin/env python3
"""
Gradients for prod virial.
"""
from tensorflow.python.framework import ops
from deepmd.env import op_grads_module


@ops.RegisterGradient("ProdVirial")
def _prod_virial_grad_cc(op, grad, grad_atom):
    net_grad = op_grads_module.prod_virial_grad(
        grad,
        op.inputs[0],
        op.inputs[1],
        op.inputs[2],
        op.inputs[3],
        op.inputs[4],
        op.inputs[5],
        n_a_sel=op.get_attr("n_a_sel"),
        n_r_sel=op.get_attr("n_r_sel"))
    return [net_grad, None, None, None, None, None]
_skbuild/linux-x86_64-3.6/setuptools/lib/deepmd/op/_prod_virial_se_a_grad.py (new file, mode 100644)

#!/usr/bin/env python3
"""
Gradients for prod virial.
"""
from tensorflow.python.framework import ops
from deepmd.env import op_grads_module


@ops.RegisterGradient("ProdVirialSeA")
def _prod_virial_se_a_grad_cc(op, grad, grad_atom):
    net_grad = op_grads_module.prod_virial_se_a_grad(
        grad,
        op.inputs[0],
        op.inputs[1],
        op.inputs[2],
        op.inputs[3],
        op.inputs[4],
        n_a_sel=op.get_attr("n_a_sel"),
        n_r_sel=op.get_attr("n_r_sel"))
    return [net_grad, None, None, None, None]
_skbuild/linux-x86_64-3.6/setuptools/lib/deepmd/op/_prod_virial_se_r_grad.py (new file, mode 100644)

#!/usr/bin/env python3
"""
Gradients for prod virial.
"""
from tensorflow.python.framework import ops
from deepmd.env import op_grads_module


@ops.RegisterGradient("ProdVirialSeR")
def _prod_virial_se_a_grad_cc(op, grad, grad_atom):
    net_grad = op_grads_module.prod_virial_se_r_grad(
        grad,
        op.inputs[0],
        op.inputs[1],
        op.inputs[2],
        op.inputs[3],
        op.inputs[4])
    return [net_grad, None, None, None, None]
_skbuild/linux-x86_64-3.6/setuptools/lib/deepmd/op/_quantize_nvnmd_grad.py (new file, mode 100644)

#!/usr/bin/env python3
from tensorflow.python.framework import ops
from deepmd.env import op_module
from deepmd.env import tf


@ops.RegisterGradient("QuantizeNvnmd")
def _QuantizeNvnmdGrad(op, grad):
    isround = op.get_attr("isround")
    nbit1 = op.get_attr("nbit1")
    nbit2 = op.get_attr("nbit2")
    nbit3 = op.get_attr("nbit3")
    dx = op_module.quantize_nvnmd(grad, isround, nbit2, nbit3, nbit1)
    return dx
_skbuild/linux-x86_64-3.6/setuptools/lib/deepmd/op/_soft_min_force_grad.py (new file, mode 100644)

#!/usr/bin/env python3
"""
Gradients for soft min force
"""
from tensorflow.python.framework import ops
from deepmd.env import op_grads_module


@ops.RegisterGradient("SoftMinForce")
def _soft_min_force_grad_cc(op, grad):
    net_grad = op_grads_module.soft_min_force_grad(
        grad,
        op.inputs[0],
        op.inputs[1],
        op.inputs[2],
        op.inputs[3],
        n_a_sel=op.get_attr("n_a_sel"),
        n_r_sel=op.get_attr("n_r_sel"))
    return [net_grad, None, None, None]
_skbuild/linux-x86_64-3.6/setuptools/lib/deepmd/op/_soft_min_virial_grad.py (new file, mode 100644)

#!/usr/bin/env python3
"""
Gradients for soft min virial.
"""
from tensorflow.python.framework import ops
from deepmd.env import op_grads_module


@ops.RegisterGradient("SoftMinVirial")
def _soft_min_virial_grad_cc(op, grad, grad_atom):
    net_grad = op_grads_module.soft_min_virial_grad(
        grad,
        op.inputs[0],
        op.inputs[1],
        op.inputs[2],
        op.inputs[3],
        op.inputs[4],
        n_a_sel=op.get_attr("n_a_sel"),
        n_r_sel=op.get_attr("n_r_sel"))
    return [net_grad, None, None, None, None]
_skbuild/linux-x86_64-3.6/setuptools/lib/deepmd/op/_tabulate_grad.py (new file, mode 100644)

#!/usr/bin/env python3
"""
Gradients for tabulate.
"""
from tensorflow.python.framework import ops
from deepmd.env import op_module
from deepmd.env import tf

# from deepmd.DescrptSeATabulate import last_layer_size


@ops.RegisterGradient("TabulateFusion")
@ops.RegisterGradient("TabulateFusionSeA")
def _tabulate_fusion_se_a_grad_cc(op, dy):
    dy_dx, dy_df = op_module.tabulate_fusion_se_a_grad(
        op.inputs[0], op.inputs[1], op.inputs[2], op.inputs[3], dy, op.outputs[0])
    return [None, None, dy_dx, dy_df]


@ops.RegisterGradient("TabulateFusionGrad")
@ops.RegisterGradient("TabulateFusionSeAGrad")
def _tabulate_fusion_se_a_grad_grad_cc(op, dy, dy_):
    dz_dy = op_module.tabulate_fusion_se_a_grad_grad(
        op.inputs[0], op.inputs[1], op.inputs[2], op.inputs[3], dy, dy_, op.inputs[5])
    return [None, None, None, None, dz_dy, None]


@ops.RegisterGradient("TabulateFusionSeT")
def _tabulate_fusion_se_t_grad_cc(op, dy):
    dy_dx, dy_df = op_module.tabulate_fusion_se_t_grad(
        op.inputs[0], op.inputs[1], op.inputs[2], op.inputs[3], dy, op.outputs[0])
    return [None, None, dy_dx, dy_df]


@ops.RegisterGradient("TabulateFusionSeTGrad")
def _tabulate_fusion_se_t_grad_grad_cc(op, dy, dy_):
    dz_dy = op_module.tabulate_fusion_se_t_grad_grad(
        op.inputs[0], op.inputs[1], op.inputs[2], op.inputs[3], dy, dy_, op.inputs[5])
    return [None, None, None, None, dz_dy, None]


@ops.RegisterGradient("TabulateFusionSeR")
def _tabulate_fusion_se_r_grad_cc(op, dy):
    dy_df = op_module.tabulate_fusion_se_r_grad(
        op.inputs[0], op.inputs[1], op.inputs[2], dy, op.outputs[0])
    return [None, None, dy_df]


@ops.RegisterGradient("TabulateFusionSeRGrad")
def _tabulate_fusion_se_r_grad_grad_cc(op, dy):
    dz_dy = op_module.tabulate_fusion_se_r_grad_grad(
        op.inputs[0], op.inputs[1], op.inputs[2], dy, op.inputs[4])
    return [None, None, None, dz_dy, None]
_skbuild/linux-x86_64-3.6/setuptools/lib/deepmd/op/_tanh2_nvnmd_grad.py (new file, mode 100644)

#!/usr/bin/env python3
from tensorflow.python.framework import ops
from deepmd.env import op_module
from deepmd.env import tf


@ops.RegisterGradient("Tanh2Nvnmd")
def _Tanh2NvnmdGrad(op, grad):
    isround = op.get_attr("isround")
    nbit1 = op.get_attr("nbit1")
    nbit2 = op.get_attr("nbit2")
    nbit3 = op.get_attr("nbit3")
    prec = 2 ** nbit2
    x = op.inputs[0]
    x_abs = tf.abs(x)
    x1 = tf.clip_by_value(x_abs, 0, 2)
    x2 = tf.clip_by_value(x_abs, 0, 4)
    dydx = (132 - 64 * x1 - x2) * 0.0078125
    if (nbit2 > -1):
        dydx = dydx + tf.stop_gradient(tf.floor(dydx * prec) / prec - dydx)
    dx = dydx * grad
    if (nbit2 > -1):
        dx = dx + tf.stop_gradient(tf.floor(dx * prec) / prec - dx)
    return dx
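This file and _tanh4_nvnmd_grad.py below repeatedly use the expression v + tf.stop_gradient(tf.floor(v * prec) / prec - v). In the forward pass it rounds a value down onto a fixed-point grid with 2**nbit2 fractional bits, while the backward pass sees only the identity, i.e. a straight-through estimator. A small self-contained sketch, not part of the commit, illustrating that behaviour:

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()


def floor_to_grid(v, prec):
    # Forward value: floor(v * prec) / prec; gradient: passes through v unchanged.
    return v + tf.stop_gradient(tf.floor(v * prec) / prec - v)


x = tf.constant([0.30, 0.77])
y = floor_to_grid(x, prec=2.0 ** 4)  # quantize onto a 1/16 grid
(dy_dx,) = tf.gradients(y, x)

with tf.Session() as sess:
    print(sess.run(y))      # [0.25 0.75]  (values snapped down to the grid)
    print(sess.run(dy_dx))  # [1. 1.]      (gradient unaffected by the rounding)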
_skbuild/linux-x86_64-3.6/setuptools/lib/deepmd/op/_tanh4_nvnmd_grad.py (new file, mode 100644)

#!/usr/bin/env python3
from tensorflow.python.framework import ops
from deepmd.env import op_module
from deepmd.env import tf


@ops.RegisterGradient("Tanh4Nvnmd")
def _Tanh4NvnmdGrad(op, grad):
    isround = op.get_attr("isround")
    nbit1 = op.get_attr("nbit1")
    nbit2 = op.get_attr("nbit2")
    nbit3 = op.get_attr("nbit3")
    prec = 2 ** nbit2
    x = op.inputs[0]
    xc = tf.clip_by_value(x, -2, 2)
    xa = tf.abs(xc)
    xx = xa * xa
    if (nbit2 > -1):
        xx = xx + tf.stop_gradient(tf.floor(xx * prec) / prec - xx)
    #
    dydx = xx * (xa / 4 - 3 / 4) + 1
    if (nbit2 > -1):
        dydx = dydx + tf.stop_gradient(tf.floor(dydx * prec) / prec - dydx)
    #
    dx = dydx * grad
    if (nbit2 > -1):
        dx = dx + tf.stop_gradient(tf.floor(dx * prec) / prec - dx)
    return dx
_skbuild/linux-x86_64-3.6/setuptools/lib/deepmd/op/libdeepmd.so (new file, mode 100644): File added
_skbuild/linux-x86_64-3.6/setuptools/lib/deepmd/op/libop_abi.so (new file, mode 100644): File added
_skbuild/linux-x86_64-3.6/setuptools/lib/deepmd/op/libop_grads.so (new file, mode 100644): File added
_skbuild/linux-x86_64-3.6/setuptools/lib/deepmd/pkg_config/run_config.ini (new file, mode 100644)

[CONFIG]
INSTALL_PREFIX = /root/deepmd-kit/_skbuild/linux-x86_64-3.6/cmake-install
GIT_SUMM = v2.1.5-dirty
GIT_HASH = 6e3d4a62
GIT_DATE = 2022-09-23 16:10:28 +0800
GIT_BRANCH = HEAD
TF_INCLUDE_DIR = /usr/local/lib/python3.6/dist-packages/tensorflow/include;/usr/local/lib/python3.6/dist-packages/tensorflow/include
TF_LIBS =
TF_VERSION = 2.4.0
TF_CXX11_ABI_FLAG = 0
MODEL_VERSION = 1.1
DP_VARIANT = cpu
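run_config.ini records how this build was produced (git state, TensorFlow headers and version, precision, CPU/GPU variant); the BUILD banner in run_options.py below formats the same keys out of GLOBAL_CONFIG. As a sketch (not part of the commit, path assumed relative to the installed package), the values can also be inspected directly with the standard-library configparser:

import configparser

cfg = configparser.ConfigParser()
cfg.read("deepmd/pkg_config/run_config.ini")  # assumed path; adjust to your install
print(cfg["CONFIG"]["TF_VERSION"])  # 2.4.0
print(cfg["CONFIG"]["DP_VARIANT"])  # cpu
print(cfg["CONFIG"]["GIT_SUMM"])    # v2.1.5-dirty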
_skbuild/linux-x86_64-3.6/setuptools/lib/deepmd/train/__init__.py (new file, mode 100644)
_skbuild/linux-x86_64-3.6/setuptools/lib/deepmd/train/run_options.py (new file, mode 100644)

"""Module taking care of important package constants."""

import logging
import os
from pathlib import Path
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple

import numpy as np

from deepmd.cluster import get_resource
from deepmd.env import get_tf_default_nthreads, tf, GLOBAL_CONFIG, global_float_prec
from deepmd.loggers import set_log_handles

if TYPE_CHECKING:
    import horovod.tensorflow as HVD

__all__ = [
    "WELCOME",
    "CITATION",
    "BUILD",
    "RunOptions",
]

log = logging.getLogger(__name__)


# http://patorjk.com/software/taag. Font:Big"
WELCOME = (  # noqa
    " _____ _____ __ __ _____ _ _ _ ",
    "| __ \ | __ \ | \/ || __ \ | | (_)| | ",
    "| | | | ___ ___ | |__) || \ / || | | | ______ | | __ _ | |_ ",
    "| | | | / _ \ / _ \| ___/ | |\/| || | | ||______|| |/ /| || __|",
    "| |__| || __/| __/| | | | | || |__| | | < | || |_ ",
    "|_____/ \___| \___||_| |_| |_||_____/ |_|\_\|_| \__|",
)

CITATION = (
    "Please read and cite:",
    "Wang, Zhang, Han and E, Comput.Phys.Comm. 228, 178-184 (2018)",
)

_sep = "\n"
BUILD = (
    f"installed to: {GLOBAL_CONFIG['install_prefix']}",
    f"source : {GLOBAL_CONFIG['git_summ']}",
    f"source branch: {GLOBAL_CONFIG['git_branch']}",
    f"source commit: {GLOBAL_CONFIG['git_hash']}",
    f"source commit at: {GLOBAL_CONFIG['git_date']}",
    f"build float prec: {global_float_prec}",
    f"build variant: {GLOBAL_CONFIG['dp_variant']}",
    f"build with tf inc: {GLOBAL_CONFIG['tf_include_dir']}",
    f"build with tf lib: {GLOBAL_CONFIG['tf_libs'].replace(';', _sep)}"  # noqa
)


class RunOptions:
    """Class with info on how to run training (cluster, MPI and GPU config).

    Attributes
    ----------
    gpus: Optional[List[int]]
        list of GPUs if any are present else None
    is_chief: bool
        in distributed training it is true for the main MPI process, in serial it is
        always true
    world_size: int
        total worker count
    my_rank: int
        index of the MPI task
    nodename: str
        name of the node
    node_list_ : List[str]
        the list of nodes of the current mpirun
    my_device: str
        device type - gpu or cpu
    """

    gpus: Optional[List[int]]
    world_size: int
    my_rank: int
    nodename: str
    nodelist: List[int]
    my_device: str

    _HVD: Optional["HVD"]
    _log_handles_already_set: bool = False

    def __init__(
        self,
        init_model: Optional[str] = None,
        init_frz_model: Optional[str] = None,
        restart: Optional[str] = None,
        log_path: Optional[str] = None,
        log_level: int = 0,
        mpi_log: str = "master",
    ):
        self._try_init_distrib()

        if all((init_model, restart)):
            raise RuntimeError(
                "--init-model and --restart should not be set at the same time")

        # model init options
        self.restart = restart
        self.init_model = init_model
        self.init_mode = "init_from_scratch"

        if restart is not None:
            self.restart = os.path.abspath(restart)
            self.init_mode = "restart"
        elif init_model is not None:
            self.init_model = os.path.abspath(init_model)
            self.init_mode = "init_from_model"
        elif init_frz_model is not None:
            self.init_frz_model = os.path.abspath(init_frz_model)
            self.init_mode = "init_from_frz_model"

        self._setup_logger(Path(log_path) if log_path else None, log_level, mpi_log)

    @property
    def is_chief(self):
        """Whether my rank is 0."""
        return self.my_rank == 0

    def print_resource_summary(self):
        """Print build and current running cluster configuration summary."""
        log.info("---Summary of the training---------------------------------------")
        if self.is_distrib:
            log.info("distributed")
            log.info(f"world size: {self.world_size}")
            log.info(f"my rank: {self.my_rank}")
            log.info(f"node list: {self.nodelist}")
        log.info(f"running on: {self.nodename}")
        log.info(f"computing device: {self.my_device}")
        env_value = os.environ.get('CUDA_VISIBLE_DEVICES', 'unset')
        log.info(f"CUDA_VISIBLE_DEVICES: {env_value}")
        log.info(f"Count of visible GPU: {len(self.gpus or [])}")
        intra, inter = get_tf_default_nthreads()
        log.info(f"num_intra_threads: {intra:d}")
        log.info(f"num_inter_threads: {inter:d}")
        log.info("-----------------------------------------------------------------")

    def _setup_logger(
        self,
        log_path: Optional[Path],
        log_level: int,
        mpi_log: Optional[str],
    ):
        """Set up package loggers.

        Parameters
        ----------
        log_level: int
            logging level
        log_path: Optional[str]
            path to log file, if None logs will be sent only to console. If the parent
            directory does not exist it will be automatically created, by default None
        mpi_log : Optional[str], optional
            mpi log type. Has three options. `master` will output logs to file and
            console only from rank==0. `collect` will write messages from all ranks to
            one file opened under rank==0 and to console. `workers` will open one log
            file for each worker designated by its rank, console behaviour is the same
            as for `collect`.
        """
        if not self._log_handles_already_set:
            if not self._HVD:
                mpi_log = None
            set_log_handles(log_level, log_path, mpi_log=mpi_log)
            self._log_handles_already_set = True
            log.debug("Log handles were successfully set")
        else:
            log.warning(
                f"Log handles have already been set. It is not advisable to "
                f"reset them{', especially when running with MPI!' if self._HVD else ''}"
            )

    def _try_init_distrib(self):
        try:
            import horovod.tensorflow as HVD
            HVD.init()
            self.is_distrib = HVD.size() > 1
        except ImportError:
            log.warning("Switch to serial execution due to lack of horovod module.")
            self.is_distrib = False

        # Do real initialization
        if self.is_distrib:
            self._init_distributed(HVD)
            self._HVD = HVD
        else:
            self._init_serial()
            self._HVD = None

    def _init_distributed(self, HVD: "HVD"):
        """Initialize settings for distributed training.

        Parameters
        ----------
        HVD : HVD
            horovod object
        """
        nodename, nodelist, gpus = get_resource()
        self.nodename = nodename
        self.nodelist = nodelist
        self.gpus = gpus
        self.my_rank = HVD.rank()
        self.world_size = HVD.size()

        if gpus is not None:
            gpu_idx = HVD.local_rank()
            if gpu_idx >= len(gpus):
                raise RuntimeError(
                    'Count of local processes is larger than that of available GPUs!')
            self.my_device = f"gpu:{gpu_idx:d}"
        else:
            self.my_device = "cpu:0"

    def _init_serial(self):
        """Initialize setting for serial training."""
        nodename, _, gpus = get_resource()

        self.gpus = gpus
        self.world_size = 1
        self.my_rank = 0
        self.nodename = nodename
        self.nodelist = [nodename]

        if gpus is not None:
            self.my_device = "gpu:0"
        else:
            self.my_device = "cpu:0"

        self._HVD = None
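A hypothetical driver sketch (not part of the commit) showing how RunOptions and the banner tuples exported by this module are typically consumed; the constructor arguments mirror the __init__ signature shown above:

from deepmd.train.run_options import BUILD, CITATION, WELCOME, RunOptions

# Resolve cluster/MPI/GPU settings once, then query the resulting attributes.
run_opt = RunOptions(init_model=None, restart=None, log_path=None,
                     log_level=20, mpi_log="master")

for line in WELCOME + CITATION + BUILD:
    print(line)

run_opt.print_resource_summary()
print(run_opt.is_chief)   # True for rank 0 (always True in serial runs)
print(run_opt.my_device)  # e.g. "cpu:0" or "gpu:0"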