# Experiment configuration with two top-level sections: `task` (model and
# input data) and `trainer` (optimization and loop scheduling).
#
# NOTE(review): the original file had lost all line breaks and indentation,
# which made it unparseable. The nesting below was reconstructed from the key
# order; every key and value is unchanged. Confirm the placement of
# `init_checkpoint` (task level vs. under `model`) and `gradient_clip_norm`
# (under `adamw`) against the schema of the consuming config classes.
task:
  hub_module_url: ''
  model:
    bidirectional: true
    # Same value as `seq_length` in both data sections below.
    max_sequence_length: 32
    logit_scale: 100
    logit_margin: 0.3
  # Placeholder: replace with a real checkpoint path before running.
  init_checkpoint: 'the pre-trained BERT checkpoint using the labse vocab.'
  train_data:
    drop_remainder: true
    global_batch_size: 4096
    # Placeholder: replace with the real training data path.
    input_path: 'the path to train partition'
    left_text_fields: ['src_raw']
    right_text_fields: ['tgt_raw']
    # Placeholder: replace with the real vocabulary file path.
    vocab_file: 'the path to vocab.txt'
    lower_case: false
    is_training: true
    seq_length: 32
    sharding: false
    cycle_length: 4
    shuffle_buffer_size: 1000
    tfds_as_supervised: false
    tfds_data_dir: ''
    tfds_name: ''
    tfds_skip_decoding_feature: ''
    tfds_split: ''
  validation_data:
    block_length: 1
    cache: false
    cycle_length: 4
    drop_remainder: false
    # Note: much larger than the training batch size (4096).
    global_batch_size: 32000
    # Placeholder: replace with the real validation data path.
    input_path: 'the path to validation partition'
    left_text_fields: ['src_raw']
    right_text_fields: ['tgt_raw']
    # Placeholder: replace with the real vocabulary file path.
    vocab_file: 'the path to vocab.txt'
    lower_case: false
    is_training: false
    seq_length: 32
    sharding: true
    shuffle_buffer_size: 1000
    tfds_as_supervised: false
    tfds_data_dir: ''
    tfds_name: ''
    tfds_skip_decoding_feature: ''
    tfds_split: ''

trainer:
  checkpoint_interval: 1000
  eval_tf_function: true
  max_to_keep: 5
  optimizer_config:
    learning_rate:
      polynomial:
        cycle: false
        # Equal to `train_steps` below.
        decay_steps: 500000
        end_learning_rate: 0.0
        initial_learning_rate: 1.0e-04
        name: PolynomialDecay
        power: 1.0
      type: polynomial
    optimizer:
      adamw:
        amsgrad: false
        beta_1: 0.9
        beta_2: 0.999
        epsilon: 1.0e-05
        exclude_from_weight_decay: null
        include_in_weight_decay: null
        name: AdamWeightDecay
        weight_decay_rate: 0.0
        # NOTE(review): placed under `adamw` because it precedes the sibling
        # `type: adamw` key in the original ordering - confirm.
        gradient_clip_norm: 100
      type: adamw
    warmup:
      polynomial:
        name: polynomial
        power: 1
        warmup_steps: 5000
      type: polynomial
  # NOTE(review): the remaining keys are assumed to belong to `trainer`
  # rather than `optimizer_config` - confirm against the trainer schema.
  steps_per_loop: 1000
  summary_interval: 1000
  train_tf_function: true
  train_tf_while_loop: true
  train_steps: 500000
  validation_interval: 1000
  validation_steps: 100