"...lm-evaluation-harness.git" did not exist on "cef6aa8d76e0ac91352ce88283db198ba45b3de7"
Commit d99506f3 authored by chenzk

v1.0.1

parent 61e92904
"""
This script use to log evaluation results to wandb.
python3 log_eval_results_to_wandb.py --eval-path /path/to/eval/results --wandb-project project_name --wandb-name run_name
The folder that contains the evaluation results should have the following structure:
- 5000:
results_x.json # where x is the ligheval's evaluation number
- 10000:
...
...
"""
import argparse
import json
import os
from pathlib import Path
import wandb
def run(current_path: Path):
def compute_avg_acc_of_a_benchmark(data, benchmark_prefix):
sum_acc, sum_acc_norm, sum_acc_stderr, sum_acc_norm_stderr, count = 0, 0, 0, 0, 0
for key, values in data.items():
if f"{benchmark_prefix}:" in key:
sum_acc += values["acc"]
sum_acc_norm += values["acc_norm"]
sum_acc_stderr += values["acc_stderr"]
sum_acc_norm_stderr += values["acc_norm_stderr"]
count += 1
average_acc = sum_acc / count if count else 0
return average_acc
def compute_avg_acc_of_all_tasks(data):
sum_acc, count = 0, 0
for _, values in data.items():
sum_acc += values["acc"]
count += 1
average_acc = sum_acc / count if count else 0
return average_acc
list_checkpoints = os.listdir(current_path)
sorted_list_checkpoints = sorted(list_checkpoints, key=int)
for item in sorted_list_checkpoints:
item_path = os.path.join(current_path, item)
if os.path.isdir(item_path):
json_files = [f for f in os.listdir(item_path) if f.endswith(".json")]
if len(json_files) == 1:
json_file_path = os.path.join(item_path, json_files[0])
with open(json_file_path, "r") as file:
eval_data = json.load(file)
iteration_step = eval_data["config_general"]["config"]["general"]["step"]
consumed_train_samples = eval_data["config_general"]["config"]["general"]["consumed_train_samples"]
logging_results = {}
for name, data in eval_data["results"].items():
logging_results[f"{name}_acc"] = data["acc"]
logging_results["mmlu:average_acc"] = compute_avg_acc_of_a_benchmark(eval_data["results"], "mmlu")
logging_results["arc:average_acc"] = compute_avg_acc_of_a_benchmark(eval_data["results"], "arc")
logging_results["all:average_acc"] = compute_avg_acc_of_all_tasks(eval_data["results"])
wandb.log(
{
**logging_results,
"iteration_step": iteration_step,
"consumed_train_samples": consumed_train_samples,
}
)
elif len(json_files) > 1:
print(f"More than one JSON file found in {item_path}. Skipping.")
else:
print(f"No JSON file found in {item_path}.")
print(f"Checkpoint {item} is done. /n")
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--eval-path", type=str, required=True, help="Path of the lighteval's evaluation results")
parser.add_argument(
"--wandb-project", type=str, help="Path of the lighteval's evaluation results", default="nanotron_evals"
)
parser.add_argument(
"--wandb-name",
type=str,
required=True,
help="Path of the lighteval's evaluation results",
default="sanity_evals",
)
return parser.parse_args()
if __name__ == "__main__":
args = get_args()
eval_path = args.eval_path
wandb_project = args.wandb_project
wandb_name = args.wandb_name
wandb.init(
project=wandb_project,
name=wandb_name,
config={"eval_path": eval_path},
)
run(eval_path)
# Pre-training
We use the [nanotron](https://github.com/huggingface/nanotron/) library to train the SmolLM and SmolLM2 base models.
The scripts for training SmolLM v1 can be found in the `smollm1` folder. SmolLM2 has a similar architecture and setup, but uses an improved data mixture that we curated and significantly longer training (11 trillion tokens for the 1.7B, 4 trillion for the 360M, and 2 trillion for the 135M). We will upload the SmolLM2 configs soon.
## Setup
Please refer to [nanotron](https://github.com/huggingface/nanotron/) for detailed instructions on setting up your training environment and launching jobs.
After setting up the environment and tokenizing the training datasets with [datatrove](https://github.com/huggingface/datatrove) (instructions available [here](https://github.com/huggingface/nanotron/blob/main/docs/nanoset.md#nanosets)), you can modify the configurations to match your number of nodes and local paths.
Below is an example of launching SmolLM1 135M training on a single node with 8 GPUs. Change the DP value to 8 in the config and adjust the batch size accordingly (see the snippet after the launch command), then run:
```bash
git clone https://github.com/huggingface/nanotron
cd nanotron
# follow installation
CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=8 run_train.py --config-file smollm1/config_smollm1_135M.yaml
```
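For reference, the provided 135M config assumes 32 data-parallel ranks (4 nodes of 8 GPUs), giving a global batch size of 8 (micro batch) * 2 (gradient accumulation) * 32 (DP) = 512 sequences of 2048 tokens, i.e. roughly 1M tokens per step. A minimal sketch of the single-node adjustment mentioned above that keeps the same global batch size (the accumulation value below is our assumption, not part of the released config; tune it to your hardware):

```yaml
parallelism:
  dp: 8                              # 1 node * 8 GPUs instead of 4 nodes * 8 GPUs
tokens:
  micro_batch_size: 8
  batch_accumulation_per_replica: 8  # 8 * 8 * 8 = 512 sequences per step, unchanged
```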
If you are working on a Slurm cluster, you can modify `launch.slurm` and launch the training with:
```bash
sbatch launch.slurm
```
> [!NOTE]
> Don't forget to create the logs directory before launching the job:
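> For example, assuming `launch.slurm` writes its output to a `logs/` folder at the repository root (adjust the path to match your script):
> ```bash
> mkdir -p logs
> ```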
# SmolLM1 135M trained on 600B tokens
checkpoints:
  checkpoint_interval: 2000
  checkpoints_path: checkpoints
  checkpoints_path_is_shared_file_system: false
  resume_checkpoint_path: null
  save_final_state: false
  save_initial_state: false
data_stages:
- data:
    dataset:
      dataset_folder: # paths to tokenized datasets
      - datasets/fineweb-edu-dedup
      - datasets/cosmopedia-v2
      - datasets/python-edu
      - datasets/open-web-math
      - datasets/stackoverflow
      dataset_weights:
      - 0.7
      - 0.15
      - 0.08
      - 0.06
      - 0.01
    num_loading_workers: 1
    seed: 42
  name: training stage
  start_training_step: 1
general:
  benchmark_csv_path: null
  consumed_train_samples: null
  ignore_sanity_checks: true
  project: smollm
  run: smollm-135M
  seed: 8
  step: null
logging:
  iteration_step_info_interval: 1
  log_level: info
  log_level_replica: info
model:
  ddp_bucket_cap_mb: 25
  dtype: bfloat16
  init_method:
    std: 0.0416 # 1/sqrt(hidden_size)
  make_vocab_size_divisible_by: 1
  model_config:
    bos_token_id: 0
    eos_token_id: 0
    hidden_act: silu
    hidden_size: 576
    initializer_range: 0.02
    intermediate_size: 1536
    is_llama_config: true
    max_position_embeddings: 2048
    num_attention_heads: 9
    num_hidden_layers: 30
    num_key_value_heads: 3
    pad_token_id: null
    pretraining_tp: 1
    rms_norm_eps: 1.0e-05
    rope_scaling: null
    rope_theta: 10000.0
    tie_word_embeddings: true
    use_cache: true
    vocab_size: 49152
optimizer:
  accumulate_grad_in_fp32: true
  clip_grad: 1.0
  learning_rate_scheduler:
    learning_rate: 0.003
    lr_decay_starting_step: 250000
    lr_decay_steps: 50000
    lr_decay_style: 1-sqrt
    lr_warmup_steps: 2500
    lr_warmup_style: linear
    min_decay_lr: 0
  optimizer_factory:
    adam_beta1: 0.9
    adam_beta2: 0.95
    adam_eps: 1.0e-08
    name: adamW
    torch_adam_is_fused: true
  weight_decay: 0.01
  zero_stage: 0
parallelism:
  dp: 32 # 4 nodes
  expert_parallel_size: 1
  pp: 1
  pp_engine: 1f1b
  recompute_layer: false
  tp: 1
  tp_linear_async_communication: true
  tp_mode: REDUCE_SCATTER
  tp_recompute_allgather: true
profiler: null
tokenizer:
  tokenizer_max_length: null
  tokenizer_name_or_path: HuggingFaceTB/cosmo2-tokenizer
  tokenizer_revision: null
tokens:
  batch_accumulation_per_replica: 2
  limit_test_batches: 0
  limit_val_batches: 0
  micro_batch_size: 8 # GBS = 8*2*32*sequence_length = 512*sequence_length = 1M tokens
  sequence_length: 2048
  train_steps: 600000
  val_check_interval: -1
# SmolLM1 135M trained on 600B tokens
checkpoints:
  checkpoint_interval: 2000
  checkpoints_path: checkpoints
  checkpoints_path_is_shared_file_system: false
  resume_checkpoint_path: null
  save_final_state: false
  save_initial_state: false
data_stages:
- data:
    dataset:
      dataset_folder: # paths to tokenized datasets
      - datasets/fineweb-edu-dedup-ds
      - datasets/fineweb-edu-dedup-ds
      - datasets/fineweb-edu-dedup-ds
      - datasets/fineweb-edu-dedup-ds
      - datasets/fineweb-edu-dedup-ds
      dataset_weights:
      - 0.7
      - 0.15
      - 0.08
      - 0.06
      - 0.01
    num_loading_workers: 1
    seed: 42
  name: training stage
  start_training_step: 1
general:
  benchmark_csv_path: null
  consumed_train_samples: null
  ignore_sanity_checks: true
  project: smollm
  run: smollm-135M
  seed: 8
  step: null
logging:
  iteration_step_info_interval: 1
  log_level: info
  log_level_replica: info
model:
  ddp_bucket_cap_mb: 25
  dtype: bfloat16
  init_method:
    std: 0.0416 # 1/sqrt(hidden_size)
  make_vocab_size_divisible_by: 1
  model_config:
    bos_token_id: 0
    eos_token_id: 0
    hidden_act: silu
    hidden_size: 576
    initializer_range: 0.02
    intermediate_size: 1536
    is_llama_config: true
    max_position_embeddings: 2048
    num_attention_heads: 9
    num_hidden_layers: 30
    num_key_value_heads: 3
    pad_token_id: null
    pretraining_tp: 1
    rms_norm_eps: 1.0e-05
    rope_scaling: null
    rope_theta: 10000.0
    tie_word_embeddings: true
    use_cache: true
    vocab_size: 49152
optimizer:
  accumulate_grad_in_fp32: true
  clip_grad: 1.0
  learning_rate_scheduler:
    learning_rate: 0.003
    lr_decay_starting_step: 250000
    lr_decay_steps: 50000
    lr_decay_style: 1-sqrt
    lr_warmup_steps: 2500
    lr_warmup_style: linear
    min_decay_lr: 0
  optimizer_factory:
    adam_beta1: 0.9
    adam_beta2: 0.95
    adam_eps: 1.0e-08
    name: adamW
    torch_adam_is_fused: true
  weight_decay: 0.01
  zero_stage: 0
parallelism:
  dp: 1
  expert_parallel_size: 1
  pp: 1
  pp_engine: 1f1b
  recompute_layer: false
  tp: 1
  tp_linear_async_communication: true
  tp_mode: REDUCE_SCATTER
  tp_recompute_allgather: true
profiler: null
tokenizer:
  tokenizer_max_length: null
  tokenizer_name_or_path: HuggingFaceTB/cosmo2-tokenizer
  tokenizer_revision: null
tokens:
  batch_accumulation_per_replica: 2
  limit_test_batches: 0
  limit_val_batches: 0
  micro_batch_size: 8 # GBS = 8*2*1*sequence_length = 16*sequence_length = ~33K tokens
  sequence_length: 2048
  train_steps: 2000
  val_check_interval: -1
# SmolLM1 135M trained on 600B tokens
checkpoints:
  checkpoint_interval: 2000
  checkpoints_path: checkpoints
  checkpoints_path_is_shared_file_system: false
  resume_checkpoint_path: null
  save_final_state: false
  save_initial_state: false
data_stages:
- data:
    dataset:
      dataset_overwrite_cache: false
      dataset_processing_num_proc_per_process: 1
      hf_dataset_config_name: null
      hf_dataset_or_datasets: datasets/fineweb-edu-dedup # HF dataset name or local path to a raw (untokenized) dataset
      hf_dataset_splits: train
      text_column_name: text
    num_loading_workers: 1
    seed: 42
  name: training stage
  start_training_step: 1
general:
  benchmark_csv_path: null
  consumed_train_samples: null
  ignore_sanity_checks: true
  project: smollm
  run: smollm-135M
  seed: 8
  step: null
logging:
  iteration_step_info_interval: 1
  log_level: info
  log_level_replica: info
model:
  ddp_bucket_cap_mb: 25
  dtype: bfloat16
  init_method:
    std: 0.0416 # 1/sqrt(hidden_size)
  make_vocab_size_divisible_by: 1
  model_config:
    bos_token_id: 0
    eos_token_id: 0
    hidden_act: silu
    hidden_size: 576
    initializer_range: 0.02
    intermediate_size: 1536
    is_llama_config: true
    max_position_embeddings: 2048
    num_attention_heads: 9
    num_hidden_layers: 30
    num_key_value_heads: 3
    pad_token_id: null
    pretraining_tp: 1
    rms_norm_eps: 1.0e-05
    rope_scaling: null
    rope_theta: 10000.0
    tie_word_embeddings: true
    use_cache: true
    vocab_size: 49152
optimizer:
  accumulate_grad_in_fp32: true
  clip_grad: 1.0
  learning_rate_scheduler:
    learning_rate: 0.003
    lr_decay_starting_step: 250000
    lr_decay_steps: 50000
    lr_decay_style: 1-sqrt
    lr_warmup_steps: 2500
    lr_warmup_style: linear
    min_decay_lr: 0
  optimizer_factory:
    adam_beta1: 0.9
    adam_beta2: 0.95
    adam_eps: 1.0e-08
    name: adamW
    torch_adam_is_fused: true
  weight_decay: 0.01
  zero_stage: 0
parallelism:
  dp: 1
  expert_parallel_size: 1
  pp: 1
  pp_engine: 1f1b
  recompute_layer: false
  tp: 1
  tp_linear_async_communication: true
  tp_mode: REDUCE_SCATTER
  tp_recompute_allgather: true
profiler: null
tokenizer:
  tokenizer_max_length: null
  tokenizer_name_or_path: HuggingFaceTB/cosmo2-tokenizer
  tokenizer_revision: null
tokens:
  batch_accumulation_per_replica: 2
  limit_test_batches: 0
  limit_val_batches: 0
  micro_batch_size: 8 # GBS = 8*2*1*sequence_length = 16*sequence_length = ~33K tokens
  sequence_length: 2048
  train_steps: 2000
  val_check_interval: -1
# SmolLM1 1.7B trained on 1T tokens
checkpoints:
  checkpoint_interval: 2000
  checkpoints_path: checkpoints
  checkpoints_path_is_shared_file_system: false
  resume_checkpoint_path: null
  save_final_state: false
  save_initial_state: false
data_stages:
- data:
    dataset:
      dataset_folder: # paths to tokenized datasets
      - datasets/fineweb-edu-dedup
      - datasets/cosmopedia-v2
      - datasets/open-web-math
      - datasets/starcoderdata-python
      - datasets/stackoverflow
      dataset_weights:
      - 0.7
      - 0.15
      - 0.06
      - 0.08
      - 0.01
    num_loading_workers: 1
    seed: 42
  name: training stage
  start_training_step: 1
- data:
    dataset: # we change data mixture to use python-edu
      dataset_folder:
      - datasets/fineweb-edu-dedup
      - datasets/cosmopedia-v2
      - datasets/open-web-math
      - datasets/python-edu
      - datasets/stackoverflow
      - datasets/deepmind_mathematics
      dataset_weights:
      - 0.7
      - 0.15
      - 0.055
      - 0.08
      - 0.01
      - 0.005
    num_loading_workers: 1
    seed: 42
  name: training stage 2
  start_training_step: 300000
general:
  benchmark_csv_path: null
  consumed_train_samples: null
  ignore_sanity_checks: true
  project: smollm
  run: smollm-1700M
  seed: 8
  step: null
logging:
  iteration_step_info_interval: 1
  log_level: info
  log_level_replica: info
model:
  ddp_bucket_cap_mb: 25
  dtype: bfloat16
  init_method:
    std: 0.022097086912079608
  make_vocab_size_divisible_by: 1
  model_config:
    bos_token_id: 0
    eos_token_id: 0
    hidden_act: silu
    hidden_size: 2048
    initializer_range: 0.02
    intermediate_size: 8192
    is_llama_config: true
    max_position_embeddings: 2048
    num_attention_heads: 32
    num_hidden_layers: 24
    num_key_value_heads: 32
    pad_token_id: null
    pretraining_tp: 1
    rms_norm_eps: 1.0e-05
    rope_scaling: null
    rope_theta: 10000.0
    tie_word_embeddings: true
    use_cache: true
    vocab_size: 49152
optimizer:
  accumulate_grad_in_fp32: true
  clip_grad: 1.0
  learning_rate_scheduler:
    learning_rate: 0.0005
    lr_decay_starting_step: 400000
    lr_decay_steps: 100000
    lr_decay_style: 1-sqrt
    lr_warmup_steps: 2000
    lr_warmup_style: linear
    min_decay_lr: 0
  optimizer_factory:
    adam_beta1: 0.9
    adam_beta2: 0.95
    adam_eps: 1.0e-08
    name: adamW
    torch_adam_is_fused: true
  weight_decay: 0.01
  zero_stage: 0
parallelism:
  dp: 64 # 8 nodes
  expert_parallel_size: 1
  pp: 1
  pp_engine: 1f1b
  recompute_layer: false
  tp: 1
  tp_linear_async_communication: true
  tp_mode: REDUCE_SCATTER
  tp_recompute_allgather: true
profiler: null
tokenizer:
  tokenizer_max_length: null
  tokenizer_name_or_path: HuggingFaceTB/cosmo2-tokenizer
  tokenizer_revision: null
tokens:
  batch_accumulation_per_replica: 4
  limit_test_batches: 0
  limit_val_batches: 0
  micro_batch_size: 4 # GBS = 4*4*64*sequence_length = 1024*sequence_length = 2.1M tokens
  sequence_length: 2048
  train_steps: 500000
  val_check_interval: -1
# SmolLM1 360M trained on 600B tokens
checkpoints:
  checkpoint_interval: 2000
  checkpoints_path: checkpoints
  checkpoints_path_is_shared_file_system: false
  resume_checkpoint_path: null
  save_final_state: false
  save_initial_state: false
data_stages:
- data:
    dataset:
      dataset_folder: # paths to tokenized datasets
      - datasets/fineweb-edu-dedup
      - datasets/cosmopedia-v2
      - datasets/python-edu
      - datasets/open-web-math
      - datasets/stackoverflow
      dataset_weights:
      - 0.7
      - 0.15
      - 0.08
      - 0.06
      - 0.01
    num_loading_workers: 1
    seed: 42
  name: training stage
  start_training_step: 1
general:
  benchmark_csv_path: null
  consumed_train_samples: null
  ignore_sanity_checks: true
  project: smollm
  run: smollm-360M
  seed: 8
  step: null
logging:
  iteration_step_info_interval: 1
  log_level: info
  log_level_replica: info
model:
  ddp_bucket_cap_mb: 25
  dtype: bfloat16
  init_method:
    std: 0.03227486121839514
  make_vocab_size_divisible_by: 1
  model_config:
    bos_token_id: 0
    eos_token_id: 0
    hidden_act: silu
    hidden_size: 960
    initializer_range: 0.02
    intermediate_size: 2560
    is_llama_config: true
    max_position_embeddings: 2048
    num_attention_heads: 15
    num_hidden_layers: 32
    num_key_value_heads: 5
    pad_token_id: null
    pretraining_tp: 1
    rms_norm_eps: 1.0e-05
    rope_scaling: null
    rope_theta: 10000.0
    tie_word_embeddings: true
    use_cache: true
    vocab_size: 49152
optimizer:
  accumulate_grad_in_fp32: true
  clip_grad: 1.0
  learning_rate_scheduler:
    learning_rate: 0.003
    lr_decay_starting_step: 500000
    lr_decay_steps: 100000
    lr_decay_style: 1-sqrt
    lr_warmup_steps: 5000
    lr_warmup_style: linear
    min_decay_lr: 0
  optimizer_factory:
    adam_beta1: 0.9
    adam_beta2: 0.95
    adam_eps: 1.0e-08
    name: adamW
    torch_adam_is_fused: true
  weight_decay: 0.01
  zero_stage: 0
parallelism:
  dp: 32
  expert_parallel_size: 1
  pp: 1
  pp_engine: 1f1b
  recompute_layer: false
  tp: 1
  tp_linear_async_communication: true
  tp_mode: REDUCE_SCATTER
  tp_recompute_allgather: true
profiler: null
tokenizer:
  tokenizer_max_length: null
  tokenizer_name_or_path: HuggingFaceTB/cosmo2-tokenizer
  tokenizer_revision: null
tokens:
  batch_accumulation_per_replica: 2
  limit_test_batches: 0
  limit_val_batches: 0
  micro_batch_size: 8
  sequence_length: 2048
  train_steps: 600000
  val_check_interval: -1