# @package _group_

common:
  fp16: true
  log_format: json
  log_interval: 200
  tensorboard_logdir: tb

checkpoint:
  no_epoch_checkpoints: true
  save_interval_updates: 50000
  keep_interval_updates: 1

distributed_training:
  distributed_world_size: 16
  ddp_backend: legacy_ddp

task:
  _name: masked_lm
  data: ???
  sample_break_mode: complete_doc
  tokens_per_sample: 512
  include_target_tokens: true
  random_token_prob: 0
  leave_unmasked_prob: 0
  mask_prob: 0.35
  mask_multiple_length: 4

criterion: model

dataset:
  max_tokens: 8192
  ignore_unused_valid_subsets: true
  skip_invalid_size_inputs_valid_test: true

optimizer:
  _name: adam
  weight_decay: 0.01
  adam_betas: (0.9,0.98)
  adam_eps: 1e-06

lr_scheduler:
  _name: cosine
  warmup_updates: 10000

optimization:
  clip_norm: 5
  lr: [0.0002]
  max_update: 1000000
  update_freq: [1]

model:
  _name: data2vec_text
  head_layers: 2
  average_top_k_layers: 10
  layer_norm_target_layer: true
  loss_scale: 1
  ema_decay: 0.999
  ema_end_decay: 0.9999
  ema_anneal_end_step: 300000
  loss_beta: 4
  ema_transformer_layers_only: true

  transformer:
    dropout: 0.1
    attention_dropout: 0.1
    layernorm_embedding: true
    activation_fn: gelu
    no_scale_embedding: true
    max_source_positions: 512
    encoder:
      embed_dim: 768
      ffn_embed_dim: 3072
      layers: 12
      attention_heads: 12
      normalize_before: false
      learned_pos: true
      layerdrop: 0
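
# Usage sketch (an assumption, not part of the original config): with a standard
# fairseq installation and a preprocessed binary dataset, a config like this is
# typically launched via fairseq-hydra-train, pointing --config-dir at the folder
# holding this file and overriding the mandatory task.data field, e.g.:
#
#   fairseq-hydra-train --config-dir /path/to/config/dir --config-name base \
#     task.data=/path/to/binarized/data
#
# The config name ("base") and the data path here are placeholders; substitute
# the actual file name and dataset location.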