# args.yaml
# all arguments are flattened into this file
# they can also be split into separate files and referenced here
n_iter: 10

# training and testing systems
systems_train: # can also be files containing system paths
  - ./systems/train.n[1-3]
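# for illustration, the glob pattern above matches the numbered system folders
# ./systems/train.n1, ./systems/train.n2 and ./systems/train.n3
# a plain file listing one system path per line can also be given, e.g. (hypothetical file name):
# systems_train:
#   - ./systems/train_systems.txt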

systems_test: # if empty, the last system of the training set is used
  - ./systems/valid.n4
  
# directory setting
workdir: "."
share_folder: "share" # folder that stores all other settings

# scf settings
scf_input: # can also be specified by a separate file
  basis: ccpvdz
  # this set of fields is chosen for force training
  # the following properties will be dumped into the data folder
  # please refer to https://arxiv.org/abs/2012.14615 for a detailed explanation of each field
  dump_fields: [atom, e_base, e_tot, dm_eig, conv, f_base, f_tot, grad_vx, l_f_delta, l_e_delta]
  verbose: 1
  # parameters that will be passed directly to pyscf Mol class
  mol_args:
    incore_anyway: True
  # parameters that will be passed directly to pyscf SCF class
  scf_args:
    conv_tol: 1e-6
    conv_tol_grad: 3e-2
    level_shift: 0.1
    diis_space: 20
    conv_check: false # pyscf conv_check has a bug

scf_machine: 
  # number of systems handled in one task (default: 1)
  # a task corresponds to a set of commands and is the smallest unit to be tracked
  sub_size: 1 
  # 2 tasks will be gathered into one group and submitted together
  # a group corresponds to a job submitted to the scheduling system
  group_size: 2 
  # if larger than 1, run that many tasks in parallel within one group (one job)
  ingroup_parallel: 1 
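  # worked example of the grouping above: the 3 training systems matched by systems_train
  # become 3 tasks (sub_size: 1), packed into 2 jobs (group_size: 2),
  # one holding 2 tasks and one holding the remaining task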
  # parameters determining the machine on which the jobs run
  dispatcher: 
    # "local" to run on local machine, or "ssh" to run on a remote machine
    context: local 
    # "slurm" to use slurm scheduler system, or "shell" to just use shell
    batch: slurm 
    # only needed when using "ssh" in context
    # pass a dict like {username: USERNAME, password: PASSWORD, work_path: /path/to/tmp/folder}
    remote_profile: null 
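    # a commented-out sketch of an ssh profile (placeholder values only):
    # remote_profile:
    #   username: USERNAME
    #   password: PASSWORD
    #   work_path: /path/to/tmp/folder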
  # parameters determining the resources allocated to each job (group of tasks)
  # only needed when batch is set to "slurm"
  # for the "shell" batch, all available resources are used automatically
  resources:
    # only set this larger than 1 when parallelizing over multiple nodes via `ingroup_parallel`
    # otherwise keep it at 1, since pyscf does not support MPI and can only run on a single node
    numb_node: 1 
    time_limit: '24:00:00'
    cpus_per_task: 8
    mem_limit: 8 #GB
    # environment variables
    envs:
      PYSCF_MAX_MEMORY: 8000 #MB, increased from the default 4 GB to 8 GB to match mem_limit above
  # resources for each sub-task within a job (group of tasks)
  # only needed when ingroup_parallel is larger than 1
  # the job's resources are reallocated among the parallel tasks
  sub_res: 
    cpus_per_task: 8
  python: "python" # use python in path

# training settings
train_input:
  # model_args is ignored here, since training restarts from the existing model
  # see init_train below for the available model_args
  data_args: 
    # training batch size, 16 is recommended
    batch_size: 16
    # if larger than 1, that many batches will be grouped together to form a larger one
    # the effective batch size is then group_batch * batch_size
    # only needed when many systems contain a single datapoint, so their batch size can only be 1
    group_batch: 1
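    # worked example: group_batch: 4 with batch_size: 1 would give an effective batch size of 4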
    # if set to true, will try to find force labels and use them in training
    extra_label: true
    # if set to true, will read the convergence data from conv_name 
    # and only use converged datapoints to train
    conv_filter: true
    conv_name: conv
  # to speed up training, deepks supports first normalizing the data (preshift and prescale)
  # and doing a linear regression on the whole training set as prefitting
  preprocess_args:
    preshift: false # the restarting model is already shifted; the shift value will not be recomputed
    prescale: false # same as above
    # prefitting is by default enabled
    prefit_ridge: 1e1 # the ridge factor used in linear regression
    prefit_trainable: false # make the linear regression fixed during the training
  train_args: 
    # the starting learning rate, which will decay during training
    start_lr: 0.0001
    # the lr decays by a factor of `decay_rate` every `decay_steps` epochs
    decay_rate: 0.5
    decay_steps: 1000
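    # implied schedule (roughly): lr(n_epoch) = start_lr * decay_rate^(n_epoch / decay_steps),
    # e.g. with the values here, after the full 5000 epochs lr ≈ 0.0001 * 0.5^5 ≈ 3.1e-6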
    # show training results every `display_epoch` epochs
    display_epoch: 100
    # the prefactor multiplying the force part of the loss
    force_factor: 1
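    # schematically (an assumption based on the comment above, not the exact implementation):
    # total_loss ≈ energy_loss + force_factor * force_loss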
    # total number of epochs in training
    n_epoch: 5000

train_machine: 
  # for training no tasks or groups are needed, since there is only one task
  # the dispatcher settings are the same as above
  dispatcher: 
    context: local
    batch: slurm
    remote_profile: null # use lazy local
  # the resources settings are also the same as above
  resources:
    time_limit: '24:00:00'
    cpus_per_task: 4
    # use a gpu for training; currently only 1 is supported
    numb_gpu: 1
    mem_limit: 8 #GB
  python: "python" # use python in path

# init settings
init_model: false # do not use existing model in share_folder/init/model.pth

# the first scf iteration, needed if init_model is false
# the possible settings are the same as in scf_input
init_scf: 
  basis: ccpvdz
  dump_fields: [atom, e_base, e_tot, dm_eig, conv, f_base, f_tot, grad_vx, l_f_delta, l_e_delta]
  verbose: 1
  mol_args:
    incore_anyway: True
  scf_args:
    conv_tol: 1e-8
    conv_check: false # pyscf conv_check has a bug

# the first training step, needed if init_model is false
# most settings are the same as in train_input, but model_args is specified here
init_train: 
  # whether to fit element-wise energy constants from the training data
  # requires `dump_fields` to contain `atom` if set to true
  fit_elem: false # this is the default
  # necessary as this is init training
  model_args: 
    # the number of *hidden* neurons per layer
    # note the first (n_descriptor) and last (1) layers are not included here
    hidden_sizes: [100, 100, 100]
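    # so the full network here reads n_descriptor -> 100 -> 100 -> 100 -> 1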
    # the output will be divided by 100 before being compared with the labels, to improve training
    output_scale: 100
    # use skip connections between layers when their sizes are the same
    use_resnet: true
    # gelu generally performs better than other activation functions
    actv_fn: gelu
    # whether to use a predefined embedding function
    # to further symmetrize the eigenvalues used as descriptors
    # adding an embedding can make the energy surface smoother and hence improve convergence,
    # but may slightly reduce accuracy (especially in generalization)
    # we do not use it for water; if you encounter convergence problems, set it to
    # embedding: thermal
    embedding: null
    # if `fit_elem` is true, setting this will use user-defined
    # element energy constants instead of fitting them from the data.
    # can be an absolute path to a file, or a length-2 list
    # containing the element charges and constants, like
    # [[1, 8], [-0.08, -0.04]]
    elem_table: null
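    # a commented-out sketch using the hypothetical values above (charges for H and O):
    # elem_table:
    #   - [1, 8]
    #   - [-0.08, -0.04]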
  # the rest are the same as above
  data_args: 
    batch_size: 16
    group_batch: 1
  preprocess_args:
    preshift: true # the init model shifts the input descriptors to zero mean
    prescale: false
    prefit_ridge: 1e1
    prefit_trainable: false
  # the following are suggested parameters for the initial training
  # note that the training curves shown in the deepks-kit paper used a different set of parameters
  # the paper's parameters take an unnecessarily long time and are no longer suggested
  train_args: 
    decay_rate: 0.95 # 0.96 in paper example training curve
    decay_steps: 300 # 500 in paper example training curve
    display_epoch: 100
    n_epoch: 15000 # 50000 in paper example training curve
    start_lr: 0.0003

# other settings
cleanup: false
strict: true