# config_mamba.yaml

# Checkpoint saving / resuming settings.
checkpoints:
  # NOTE(review): presumably counted in training steps — confirm with the
  # consumer. If so, 10 divides the 100 `tokens.train_steps` evenly.
  checkpoint_interval: 10
  # NOTE(review): absolute, user-specific path; it will most likely need to be
  # overridden when this config is used on another machine or account.
  checkpoints_path: /fsx/ferdinandmom/ferdinand-hf/brrr/nanotron/examples/checkpoints
  checkpoints_path_is_shared_file_system: false
  # null: no checkpoint path to resume from is given.
  resume_checkpoint_path: null
  save_initial_state: false

# List of training data stages; this file defines exactly one stage.
data_stages:
  - name: General purpose training
    # NOTE(review): presumably the training step at which this stage becomes
    # active — confirm with the consumer.
    start_training_step: 1
    data:
      dataset:
        dataset_overwrite_cache: false
        dataset_processing_num_proc_per_process: 24
        hf_dataset_config_name: null
        # Mapping of dataset identifier -> 1.0 (presumably a sampling weight —
        # confirm). Only one dataset is listed.
        hf_dataset_or_datasets:
          roneneldan/TinyStories: 1.0
        hf_dataset_splits: train
        text_column_name: text
      num_loading_workers: 1
      # Same value as `general.seed`.
      seed: 42
# Run-level settings: project/run identifiers and the global seed.
general:
  benchmark_csv_path: null
  consumed_train_samples: null
  # NOTE(review): sanity checks are switched off here; presumably this trades
  # safety for speed — consider setting to false while debugging.
  ignore_sanity_checks: true
  project: test
  run: mamba
  seed: 42
  step: null
# Explicitly null: no lighteval settings are provided in this config.
lighteval: null
# Logging verbosity and cadence.
logging:
  # NOTE(review): presumably "log iteration info every N steps" — confirm.
  iteration_step_info_interval: 1
  log_level: info
  # Same level as `log_level`.
  log_level_replica: info
# Model definition: precision, weight initialisation, and the Mamba
# architecture hyper-parameters.
model:
  ddp_bucket_cap_mb: 25
  # Also repeated as `model_config.dtype` below; keep the two in sync.
  dtype: bfloat16
  init_method:
    initializer_range: 0.02
    n_residuals_per_layer: 1
    rescale_prenorm_residual: true
  # 1 means any vocabulary size satisfies this divisibility constraint.
  make_vocab_size_divisible_by: 1
  model_config:
    d_model: 1536
    # Duplicates `model.dtype` above.
    dtype: bfloat16
    fused_add_norm: true
    is_mamba_config: true
    num_hidden_layers: 48
    pad_token_id: null
    # NOTE(review): `vocab_size` (50277) is not a multiple of 8; the next
    # multiple is 50280. Presumably the consumer pads up to it — confirm.
    pad_vocab_size_multiple: 8
    residual_in_fp32: true
    rms_norm: true
    rms_norm_eps: 1.0e-05
    # Hyper-parameters of the state-space (SSM) mixer.
    ssm_cfg:
      bias: false
      conv_bias: true
      d_conv: 4
      d_state: 16
      dt_init: random
      dt_init_floor: 0.0001
      # dt_min (0.001) < dt_max (0.1).
      dt_max: 0.1
      dt_min: 0.001
      # NOTE(review): "auto" is resolved by the consumer; the derived value is
      # not visible in this file — confirm how it is computed.
      dt_rank: auto
      dt_scale: 1.0
      # NOTE(review): presumably the inner width is expand * d_model
      # (2 * 1536 = 3072) — confirm.
      expand: 2
      use_fast_path: true
    # NOTE(review): verify this is at least as large as the vocabulary of the
    # tokenizer named under `tokenizer.tokenizer_name_or_path` (gpt2).
    vocab_size: 50277
# Optimizer (Adam-family, per the adam_* keys) and learning-rate schedule.
optimizer:
  accumulate_grad_in_fp32: true
  adam_beta1: 0.9
  adam_beta2: 0.95
  adam_eps: 1.0e-08
  clip_grad: 1.0
  learning_rate_scheduler:
    learning_rate: 0.0003
    # null: no explicit decay start step is given.
    # NOTE(review): presumably decay then starts right after warmup — confirm.
    lr_decay_starting_step: null
    # lr_warmup_steps (10) + lr_decay_steps (90) = 100 = `tokens.train_steps`;
    # keep these three values consistent when changing any of them.
    lr_decay_steps: 90
    lr_decay_style: cosine
    lr_warmup_steps: 10
    lr_warmup_style: linear
    # Lower than `learning_rate` (0.0003).
    min_decay_lr: 1.0e-05
  torch_adam_is_fused: true
  weight_decay: 0.01
  zero_stage: 0
# Parallelism layout. dp * pp * tp = 2 * 2 * 2 = 8.
# NOTE(review): presumably this is the number of processes/GPUs the run
# must be launched with — confirm.
parallelism:
  dp: 2
  expert_parallel_size: 1
  pp: 2
  pp_engine: 1f1b
  tp: 2
  tp_linear_async_communication: false
  tp_mode: ALL_REDUCE
# Explicitly null: no profiler settings are provided in this config.
profiler: null
# Tokenizer selection.
tokenizer:
  tokenizer_max_length: null
  # NOTE(review): `model.model_config.vocab_size` is 50277; verify that it
  # covers every token id this tokenizer can produce.
  tokenizer_name_or_path: gpt2
  tokenizer_revision: null
# Batch shape and training length.
# NOTE(review): with `parallelism.dp` = 2, presumably each step consumes
# micro_batch_size * batch_accumulation_per_replica * dp = 2 * 1 * 2 = 4
# sequences of 2048 tokens — confirm with the consumer.
tokens:
  batch_accumulation_per_replica: 1
  limit_test_batches: 0
  # NOTE(review): 0 here together with val_check_interval: -1 presumably
  # means validation is not run — confirm.
  limit_val_batches: 0
  micro_batch_size: 2
  sequence_length: 2048
  # Equals lr_warmup_steps (10) + lr_decay_steps (90) under `optimizer`.
  train_steps: 100
  val_check_interval: -1