# combined.yaml
# all arguments are flattened into this file
# they can also be split into separate files and referenced here
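# Illustrative only: a sub-section can instead be given as a path to its own
# YAML file, assuming the loader accepts a file path in place of an inline
# dict, e.g.
# scf_input: ./scf_input.yaml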

# number of iterations to run; can be set to zero for DeePHF training
n_iter: 5

# training and testing systems
systems_train: # can also be files containing system paths; see the commented example after this list
  - ../system/batch/set.0[0-5]* # glob patterns are supported
  - ../system/batch/set.060
  - ../system/batch/set.061
  - ../system/batch/set.062
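  # as noted above, an entry may also point to a text file that itself lists
  # system paths; the filename below is only an example:
  # - ./systems_train.raw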

systems_test: # if empty, the last system of the training set is used
  - ../system/batch/set.063
  
# directory setting
workdir: "."
share_folder: "share" # folder that stores all other settings

# scf settings
scf_input: # can also be specified by a separate file
  basis: ccpvdz
  # this field list is for force training (compare init_scf below, which is energy-only)
  dump_fields: [e_base, e_tot, dm_eig, conv, f_base, f_tot, grad_vx, l_f_delta, l_e_delta]
  verbose: 1
  mol_args:
    incore_anyway: True
  scf_args:
    conv_tol: 1e-6
    conv_tol_grad: 1e-2
    level_shift: 0.1
    diis_space: 20
    conv_check: false # pyscf's conv_check has a bug

scf_machine: 
  sub_size: 5 # 5 systems go into one task (default is 1); see the worked example after this block
  group_size: 2 # 2 tasks will be gathered into one group and submitted together
  ingroup_parallel: 2 # this will set numb_node to 2 in resources
  dispatcher: 
    context: local
    batch: slurm
    remote_profile: null # use lazy local
  resources:
    numb_node: 2 # parallel in two nodes
    time_limit: '24:00:00'
    cpus_per_task: 8
    mem_limit: 8
    envs:
      PYSCF_MAX_MEMORY: 8000 # increase from 4G to 8G
  sub_res: # resources for each sub task
    cpus_per_task: 8
  python: "python" # use python in path
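# A rough worked example of how sub_size, group_size and ingroup_parallel
# combine, assuming the glob above matches set.000-set.059 so that there are
# 63 training systems in total:
#   sub_size: 5         -> ceil(63 / 5) = 13 SCF tasks
#   group_size: 2       -> ceil(13 / 2) = 7 groups submitted to slurm
#   ingroup_parallel: 2 -> the 2 tasks in each group run on 2 nodes at once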

# train settings
train_input:
  # model_args is ignored, since this run restarts from an existing model
  data_args: 
    batch_size: 16
    group_batch: 1
    extra_label: true
    conv_filter: true
    conv_name: conv
  preprocess_args:
    preshift: false # the restarted model is already shifted; do not recompute the shift value
    prescale: false # same as above
    prefit_ridge: 1e1
    prefit_trainable: false
  train_args: # see the schedule note in the comments after this block
    decay_rate: 0.5
    decay_steps: 1000
    display_epoch: 100
    force_factor: 0.1
    n_epoch: 5000
    start_lr: 0.0001
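    # Rough illustration of the schedule above, assuming a plain exponential
    # decay lr = start_lr * decay_rate ** (epoch / decay_steps) (the exact
    # implementation may differ): over 5000 epochs the lr drops by a factor
    # of 0.5 ** 5 = 1/32, i.e. from 1e-4 to roughly 3e-6.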

train_machine: 
  dispatcher: 
    context: local
    batch: slurm
    remote_profile: null # use lazy local
  resources:
    time_limit: '24:00:00'
    cpus_per_task: 4
    numb_gpu: 1
    mem_limit: 8
  python: "python" # use python in path

# init settings
init_model: false # do not use an existing model in share_folder/init/model.pth

init_scf: 
  basis: ccpvdz
  # this field list is for energy-only training (no force labels)
  dump_fields: [e_base, e_tot, dm_eig, conv, l_e_delta]
  verbose: 1
  mol_args:
    incore_anyway: True
  scf_args:
    conv_tol: 1e-8
    conv_check: false # pyscf's conv_check has a bug

init_train: 
  model_args: # required here since this is the initial training (no model to restart from)
    hidden_sizes: [200, 200, 200]
    output_scale: 100
    use_resnet: true
    actv_fn: mygelu
  data_args: 
    batch_size: 16
    group_batch: 1
  preprocess_args:
    preshift: true
    prescale: false
    prefit_ridge: 1e1
    prefit_trainable: false
  train_args: 
    decay_rate: 0.96
    decay_steps: 500
    display_epoch: 100
    n_epoch: 50000
    start_lr: 0.0003

# other settings
cleanup: false
strict: true
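
# This combined file is meant to drive the iterative deepks-kit workflow.
# The exact invocation depends on the installed version; as an assumption it
# would look something like:
#   python -m deepks iterate combined.yaml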