# combined.yaml
# all arguments are flattened into this file
# they can also be split into separate files and referenced here
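# Illustrative only: a sub-section can instead be given as a path to its own
# YAML file, assuming the loader accepts a file path in place of an inline
# dict, e.g.
# scf_input: ./scf_input.yaml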

# number of iterations to run; can be set to zero for DeePHF training
n_iter: 5

# training and testing systems
systems_train: # can also be files containing system paths; see the commented example after this list
  - ../system/batch/set.0[0-5]* # glob patterns are supported
  - ../system/batch/set.060
  - ../system/batch/set.061
  - ../system/batch/set.062
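  # as noted above, an entry may also point to a text file that itself lists
  # system paths; the filename below is only an example:
  # - ./systems_train.raw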

systems_test: # if empty, the last system of the training set is used
  - ../system/batch/set.063
  
# directory setting
workdir: "."
share_folder: "share" # folder that stores all other settings

# scf settings
scf_input: # can also be specified by a separate file
  basis: ccpvdz
  # this field list is for force training (compare init_scf below, which is energy-only)
  dump_fields: [e_base, e_tot, dm_eig, conv, f_base, f_tot, grad_vx, l_f_delta, l_e_delta]
  verbose: 1
  mol_args:
    incore_anyway: True
  scf_args:
    conv_tol: 1e-6
    conv_tol_grad: 1e-2
    level_shift: 0.1
    diis_space: 20
    conv_check: false # pyscf's conv_check has a bug

scf_machine: 
  sub_size: 5 # 5 systems go into one task (default is 1); see the worked example after this block
  group_size: 2 # 2 tasks will be gathered into one group and submitted together
  ingroup_parallel: 2 # this will set numb_node to 2 in resources
  dispatcher: 
    context: local
    batch: slurm
    remote_profile: null # use lazy local
  resources:
    numb_node: 2 # parallel in two nodes
    time_limit: '24:00:00'
    cpus_per_task: 8
    mem_limit: 8
    envs:
      PYSCF_MAX_MEMORY: 8000 # increase from 4G to 8G
  sub_res: # resources for each sub task
    cpus_per_task: 8
  python: "python" # use python in path
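# A rough worked example of how sub_size, group_size and ingroup_parallel
# combine, assuming the glob above matches set.000-set.059 so that there are
# 63 training systems in total:
#   sub_size: 5         -> ceil(63 / 5) = 13 SCF tasks
#   group_size: 2       -> ceil(13 / 2) = 7 groups submitted to slurm
#   ingroup_parallel: 2 -> the 2 tasks in each group run on 2 nodes at once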

# train settings
train_input:
  # model_args is ignored, since this run restarts from an existing model
  data_args: 
    batch_size: 16
    group_batch: 1
    extra_label: true
    conv_filter: true
    conv_name: conv
  preprocess_args:
    preshift: false # the restarted model is already shifted; do not recompute the shift value
    prescale: false # same as above
    prefit_ridge: 1e1
    prefit_trainable: false
  train_args: # see the schedule note in the comments after this block
    decay_rate: 0.5
    decay_steps: 1000
    display_epoch: 100
    force_factor: 0.1
    n_epoch: 5000
    start_lr: 0.0001
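    # Rough illustration of the schedule above, assuming a plain exponential
    # decay lr = start_lr * decay_rate ** (epoch / decay_steps) (the exact
    # implementation may differ): over 5000 epochs the lr drops by a factor
    # of 0.5 ** 5 = 1/32, i.e. from 1e-4 to roughly 3e-6.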

train_machine: 
  dispatcher: 
    context: local
    batch: slurm
    remote_profile: null # use lazy local
  resources:
    time_limit: '24:00:00'
    cpus_per_task: 4
    numb_gpu: 1
    mem_limit: 8
  python: "python" # use python in path

# init settings
init_model: false # do not use an existing model in share_folder/init/model.pth

init_scf: 
  basis: ccpvdz
  # this field list is for energy-only training (no force labels)
  dump_fields: [e_base, e_tot, dm_eig, conv, l_e_delta]
  verbose: 1
  mol_args:
    incore_anyway: True
  scf_args:
    conv_tol: 1e-8
    conv_check: false # pyscf's conv_check has a bug

init_train: 
  model_args: # required here since this is the initial training (no model to restart from)
    hidden_sizes: [200, 200, 200]
    output_scale: 100
    use_resnet: true
    actv_fn: mygelu
  data_args: 
    batch_size: 16
    group_batch: 1
  preprocess_args:
    preshift: true
    prescale: false
    prefit_ridge: 1e1
    prefit_trainable: false
  train_args: 
    decay_rate: 0.96
    decay_steps: 500
    display_epoch: 100
    n_epoch: 50000
    start_lr: 0.0003

# other settings
cleanup: false
strict: true
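
# This combined file is meant to drive the iterative deepks-kit workflow.
# The exact invocation depends on the installed version; as an assumption it
# would look something like:
#   python -m deepks iterate combined.yaml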