Commit 71e79847 authored by chenzk

v1.0.3

checkpoints:
checkpoint_interval: 10000
checkpoints_path: checkpoints
checkpoints_path_is_shared_file_system: false
resume_checkpoint_path: null
save_initial_state: false
data_stages:
- name: Stable Training Stage
start_training_step: 1
data:
dataset:
dataset_overwrite_cache: false
dataset_processing_num_proc_per_process: 1
hf_dataset_config_name: null
hf_dataset_or_datasets: roneneldan/TinyStories
hf_dataset_splits: train
text_column_name: text
num_loading_workers: 1
seed: 42
- name: Annealing Phase
start_training_step: 9000
data:
dataset:
dataset_overwrite_cache: false
dataset_processing_num_proc_per_process: 1
hf_dataset_config_name: null
hf_dataset_or_datasets: HuggingFaceH4/testing_alpaca_small
hf_dataset_splits: train
text_column_name: completion
num_loading_workers: 1
seed: 42
general:
benchmark_csv_path: null
consumed_train_samples: null
ignore_sanity_checks: true
project: debug
run: llama_350m_mup
seed: 42
step: null
logging:
iteration_step_info_interval: 1
log_level: debug
log_level_replica: info
model:
ddp_bucket_cap_mb: 120
dtype: bfloat16
init_method:
use_mup: true
make_vocab_size_divisible_by: 1
model_config:
bos_token_id: 1
eos_token_id: 2
hidden_act: silu
initializer_range: 0.02
hidden_size: 1024
intermediate_size: 4096
num_hidden_layers: 14
is_llama_config: true
max_position_embeddings: 1024
num_attention_heads: 8
num_key_value_heads: 4
pad_token_id: null
pretraining_tp: 1
rms_norm_eps: 1.0e-05
rope_scaling: null
tie_word_embeddings: false
use_cache: true
vocab_size: 49152
optimizer:
accumulate_grad_in_fp32: false
adam_beta1: 0.9
adam_beta2: 0.95
adam_eps: 1.0e-08
clip_grad: 1.0
learning_rate_scheduler:
learning_rate: 0.001
lr_decay_starting_step: null
lr_decay_steps: null
lr_decay_style: cosine
lr_warmup_steps: 100 # warmup over the first 100 of the 440 training steps
lr_warmup_style: linear
min_decay_lr: 1.0e-05
torch_adam_is_fused: true
weight_decay: 0.1
zero_stage: 0
parallelism:
dp: 4
pp: 1
pp_engine: 1f1b
tp: 2
tp_linear_async_communication: true
tp_mode: REDUCE_SCATTER
profiler: null
tokenizer:
tokenizer_max_length: null
tokenizer_name_or_path: gpt2
tokenizer_revision: null
tokens:
batch_accumulation_per_replica: 8
limit_test_batches: 0
limit_val_batches: 0
micro_batch_size: 32
sequence_length: 1024
train_steps: 440
val_check_interval: -1
lighteval:
batch_size: 16
checkpoints_path: null
generation: null
logging:
hub_repo_details: null
hub_repo_results: null
# hub_repo_tensorboard: HuggingFaceBR4/fmom-mamba2
local_output_path: /fsx/phuc/new_workspace/experiments/mup_for_mamba2/test_mamba350M_tp4_917cfc66/logs
push_details_to_hub: null
push_results_to_hub: null
push_results_to_tensorboard: true
tensorboard_metric_prefix: e
parallelism:
dp: 2
expert_parallel_size: 1
pp: 1
pp_engine: 1f1b
tp: 2
tp_linear_async_communication: false
tp_mode: ALL_REDUCE
# slurm_script_dir: /fsx/phuc/new_workspace/experiments/mup_for_mamba2/test_mamba350M_tp4_917cfc66/lighteval/slurm_scripts
# slurm_template: /fsx/phuc/new_workspace/experiments/mup_for_mamba2/test_mamba350M_tp4_917cfc66/run_eval.slurm.jinja
tasks:
# custom_tasks: brrr.lighteval.custom_tasks
dataset_loading_processes: 8
max_samples: 1000
multichoice_continuations_start_space: null
no_multichoice_continuations_start_space: null
num_fewshot_seeds: null
tasks: early-signal
wandb: null
checkpoints:
checkpoint_interval: 10000
checkpoints_path: checkpoints
checkpoints_path_is_shared_file_system: false
resume_checkpoint_path: null
save_initial_state: false
data_stages:
- name: Stable Training Stage
start_training_step: 1
data:
dataset:
dataset_overwrite_cache: false
dataset_processing_num_proc_per_process: 1
hf_dataset_config_name: null
hf_dataset_or_datasets: roneneldan/TinyStories
hf_dataset_splits: train
text_column_name: text
num_loading_workers: 1
seed: 42
- name: Annealing Phase
start_training_step: 9000
data:
dataset:
dataset_overwrite_cache: false
dataset_processing_num_proc_per_process: 1
hf_dataset_config_name: null
hf_dataset_or_datasets: HuggingFaceH4/testing_alpaca_small
hf_dataset_splits: train
text_column_name: completion
num_loading_workers: 1
seed: 42
general:
benchmark_csv_path: null
consumed_train_samples: null
ignore_sanity_checks: true
project: debug
run: llama_350m_sp
seed: 42
step: null
lighteval: null
logging:
iteration_step_info_interval: 1
log_level: info
log_level_replica: info
model:
ddp_bucket_cap_mb: 120
dtype: bfloat16
init_method:
std: 0.03125 # 1/sqrt(hidden_size) = 1/sqrt(1024)
make_vocab_size_divisible_by: 1
model_config:
bos_token_id: 1
eos_token_id: 2
hidden_act: silu
initializer_range: 0.02
hidden_size: 1024
intermediate_size: 4096
num_hidden_layers: 14
is_llama_config: true
max_position_embeddings: 1024
num_attention_heads: 8
num_key_value_heads: 4
pad_token_id: null
pretraining_tp: 1
rms_norm_eps: 1.0e-05
rope_scaling: null
tie_word_embeddings: false
use_cache: true
vocab_size: 49152
optimizer:
accumulate_grad_in_fp32: false
adam_beta1: 0.9
adam_beta2: 0.95
adam_eps: 1.0e-08
clip_grad: 1.0
learning_rate_scheduler:
learning_rate: 0.001
lr_decay_starting_step: null
lr_decay_steps: null
lr_decay_style: cosine
lr_warmup_steps: 100 # warmup over the first 100 of the 440 training steps
lr_warmup_style: linear
min_decay_lr: 1.0e-05
torch_adam_is_fused: true
weight_decay: 0.1
zero_stage: 0
parallelism:
dp: 4
pp: 1
pp_engine: 1f1b
tp: 2
tp_linear_async_communication: true
tp_mode: REDUCE_SCATTER
profiler: null
tokenizer:
tokenizer_max_length: null
tokenizer_name_or_path: gpt2
tokenizer_revision: null
tokens:
batch_accumulation_per_replica: 8
limit_test_batches: 0
limit_val_batches: 0
micro_batch_size: 32
sequence_length: 1024
train_steps: 440
val_check_interval: -1
#!/bin/bash
# Simple script to create a tiny llama model and train it
set -e -x
# Create the YAML config file
EXAMPLE_PATH=$(cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P)
REPO_PATH=$(dirname $EXAMPLE_PATH)
python $EXAMPLE_PATH/config_tiny_llama.py
# Setup from environment variables
export CUDA_DEVICE_MAX_CONNECTIONS=1
export FI_PROVIDER="efa"
python -u -m torch.distributed.run \
--nproc_per_node 8 \
--nnodes 1 \
--rdzv_backend c10d \
--max_restarts 0 \
--tee 3 \
$REPO_PATH/run_train.py --config-file $EXAMPLE_PATH/config_tiny_llama.yaml
wget https://mirror.ghproxy.com/https://github.com/git-lfs/git-lfs/releases/download/v3.5.1/git-lfs-linux-amd64-v3.5.1.tar.gz
tar -xzvf git-lfs-linux-amd64-v3.5.1.tar.gz
./git-lfs-3.5.1/install.sh
rm -rf git-lfs-3.5.1 git-lfs-linux-amd64-v3.5.1.tar.gz
icon.png (53.8 KB)
torchrun --nproc_per_node=1 run_generate.py --ckpt-path checkpoints/10/ --tp 1 --pp 1
#!/bin/bash
#SBATCH --job-name=smollm1-135M
#SBATCH --nodes=4
#SBATCH --gres=gpu:8
#SBATCH --qos=high
#SBATCH --output=./logs/train-%j.out
#SBATCH --error=./logs/train-%j.err
set -e
TRAINER_PYTHON_FILE="run_train.py"
CONFIG_PATH_YAML="smollm1/config_smollm1_135M.yaml"
nvidia-smi
# Show some environment variables
echo python3 version = `python3 --version`
echo "Python path: $(which python3)"
echo "NCCL version: $(python -c "import torch;print(torch.cuda.nccl.version())")"
echo "CUDA version: $(python -c "import torch;print(torch.version.cuda)")"
echo "START TIME: $(date)"
secs_to_human() {
echo "$(( ${1} / 3600 )):$(( (${1} / 60) % 60 )):$(( ${1} % 60 ))"
}
start=$(date +%s)
echo "$(date -d @${start} "+%Y-%m-%d %H:%M:%S"): ${SLURM_JOB_NAME} start id=${SLURM_JOB_ID}\n"
# SLURM stuff
export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"`
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_PORT=6000
export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l`
export CUDA_DEVICE_MAX_CONNECTIONS="1"
echo "Number of nodes: $COUNT_NODE"
echo "Hostnames: $HOSTNAMES"
CMD=" $TRAINER_PYTHON_FILE \
--config-file $CONFIG_PATH_YAML \
"
export LAUNCHER="torchrun \
--nproc_per_node 8 \
--nnodes $COUNT_NODE \
--node_rank $SLURM_PROCID \
--role $SLURMD_NODENAME: \
--max_restarts 0 \
--tee 3 \
"
# Wait a random amount of time between 0 and 1000 milliseconds to avoid too many concurrent requests to the hub
random_milliseconds=$(( RANDOM % 1001 ))
sleep_time=$(bc <<< "scale=3; $random_milliseconds / 1000")
echo "Sleeping for $sleep_time seconds..."
sleep $sleep_time
srun $SRUN_ARGS -u bash -c "$LAUNCHER --node_rank $SLURM_PROCID --role $SLURMD_NODENAME: $CMD"
echo "END TIME: $(date)"
# Model code
modelCode=1128
# Model name
modelName=nanotron_pytorch
# Model description
modelDescription=Fully open-source large-model pretraining; this project can pretrain large language models that outperform qwen2.5 and llama3, and is the training code used by some major AI companies.
# Application scenarios
appScenario=inference,training,dialogue Q&A,manufacturing,broadcast media,finance,energy,healthcare,smart home,education
# Framework type
frameType=pytorch
[project]
name = "nanotron"
version = "0.4"
description = "Minimalistic Large Language Model Training and Finetuning"
authors = [
{name = "Nouamane Tazi", email="nouamane@huggingface.co"},
{name = "Thomas Wang", email="thomas.wang@huggingface.co"},
{name = "Kunhao Zheng", email="kunhao@huggingface.co"},
{name = "Thomas Wolf", email="thomas@huggingface.co"},
]
readme = "README.md"
requires-python = "~=3.10"
classifiers = [
"Topic :: Software Development"
]
dependencies = [
"torch>=1.13.1",
"pyyaml",
"numpy",
"packaging",
"safetensors",
"dacite",
"tqdm",
"datasets",
]
[tool.setuptools.packages.find]
where = ["src"] # list of folders that contain the packages (["."] by default)
[tool.ruff]
line-length = 119
ignore = ["C901","E501"] # E501: line-length violations are handled by black
select = ["C","E","F","I","W"]
ignore-init-module-imports = true
[project.optional-dependencies]
dev = [
"pre-commit",
"pylint"
]
test = [
"pytest",
"pytest-xdist"
]
fast-modeling = [
"flash-attn>=2.5.0",
]
nanosets = [
"transformers",
"datatrove[io,processing]@git+https://github.com/huggingface/datatrove",
"numba",
]
s3 = [
"boto3",
"s3fs",
"s5cmd",
]
[build-system]
requires = [
"setuptools",
]
[pytest]
norecursedirs="tests/helpers"
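Given the optional dependency groups defined above, an editable install from the repository root would typically look like the sketch below (the `dev`, `test`, `fast-modeling`, and `nanosets` extras match the groups in `pyproject.toml`; note that `flash-attn` needs a CUDA toolchain to build):
```bash
pip install -e ".[dev,test]"
# optionally, add flash-attention kernels and Nanoset support
pip install -e ".[fast-modeling,nanosets]"
```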
dummy-tokenizer-wordlevel @ ae57c419
Subproject commit ae57c419a98ae4ddf991c6a1af4a8ce94745f45c
"""
Nanotron Inference Script
Usage:
```
export CUDA_DEVICE_MAX_CONNECTIONS=1 # important for some distributed operations
torchrun --nproc_per_node=4 run_generate.py --ckpt-path checkpoints/test/4
```
"""
import argparse
import os
from pathlib import Path
import torch
from nanotron import distributed as dist
from nanotron import logging
from nanotron.config import (
GenerationArgs,
LoggingArgs,
ParallelismArgs,
get_config_from_file,
)
from nanotron.generation.decode import (
GenerationInput,
TokenizerConfig,
decode_text,
decode_tokenized,
)
from nanotron.logging import log_rank, set_ranks_logging_level
from nanotron.models import build_model
from nanotron.parallel import ParallelContext
from nanotron.parallel.parameters import sanity_check
from nanotron.parallel.pipeline_parallel.engine import (
OneForwardOneBackwardPipelineEngine,
)
from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer
from nanotron.parallel.tensor_parallel.enum import TensorParallelLinearMode
from nanotron.random import (
RandomStates,
get_current_random_state,
get_synced_random_state,
set_random_seed,
)
from nanotron.serialize import load_weights
from nanotron.trainer import CONFIG_TO_MODEL_CLASS, mark_tied_parameters
try:
from transformers import AutoTokenizer
except ImportError:
AutoTokenizer = None
logger = logging.get_logger(__name__)
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--ckpt-path", type=Path, required=True, help="Checkpoint path")
parser.add_argument("--dp", type=int, default=1)
parser.add_argument("--pp", type=int, default=0)
parser.add_argument("--tp", type=int, default=0)
parser.add_argument("--max-new-tokens", type=int, default=128, help="Maximum number of new tokens to generate")
return parser.parse_args()
def main():
args = get_args()
assert args.ckpt_path.exists(), f"Checkpoint path {args.ckpt_path} does not exist"
config = get_config_from_file((args.ckpt_path / "config.yaml").as_posix())
model_config = config.model.model_config
tokenizer_path = config.tokenizer.tokenizer_name_or_path
parallel_config = ParallelismArgs(
dp=args.dp or config.parallelism.dp,
pp=args.pp or config.parallelism.pp,
tp=args.tp or config.parallelism.tp,
pp_engine=OneForwardOneBackwardPipelineEngine(),
tp_mode=TensorParallelLinearMode.ALL_REDUCE,
tp_linear_async_communication=False,
)
# Initialise all process groups
parallel_context = ParallelContext(
data_parallel_size=parallel_config.dp,
pipeline_parallel_size=parallel_config.pp,
tensor_parallel_size=parallel_config.tp,
)
# Set log levels
logging_config = LoggingArgs(
log_level="info",
log_level_replica="info",
)
# Set log levels
set_ranks_logging_level(parallel_context=parallel_context, logging_config=logging_config)
log_rank(f"model_config: {model_config}", logger=logger, level=logging.INFO, rank=0)
log_rank(f"tokenizer_path: {tokenizer_path}", logger=logger, level=logging.INFO, rank=0)
dtype = torch.bfloat16
# Set random states
set_random_seed(42)
model_config_cls = model_config.__class__.__name__
if model_config_cls not in CONFIG_TO_MODEL_CLASS:
raise ValueError(
f"Unsupported model config {model_config_cls}. Only {CONFIG_TO_MODEL_CLASS.keys()} are supported"
)
# Get synchronized random states
if parallel_config.tp_mode is TensorParallelLinearMode.ALL_REDUCE:
random_states = RandomStates(
{"tp_synced": get_synced_random_state(random_state=get_current_random_state(), pg=parallel_context.tp_pg)}
)
else:
# We don't need to sync across TP when using sequence parallel (REDUCE_SCATTER)
random_states = RandomStates({})
model = build_model(
model_builder=lambda: CONFIG_TO_MODEL_CLASS[model_config_cls](
config=model_config,
parallel_context=parallel_context,
parallel_config=parallel_config,
random_states=random_states,
),
dtype=dtype,
parallel_context=parallel_context,
)
# Mark some parameters as tied
# TODO @nouamane: this is only needed for training, can we just mark params as NanotronParameter instead?
mark_tied_parameters(model=model, parallel_context=parallel_context, parallel_config=parallel_config)
# Sanity check model
sanity_check(root_module=model)
# Load checkpoint
checkpoint_path = args.ckpt_path
log_rank(
f"Loading checkpoint from {checkpoint_path}:",
logger=logger,
level=logging.INFO,
rank=0,
)
load_weights(model=model, parallel_context=parallel_context, root_folder=checkpoint_path)
model.eval()
if AutoTokenizer is not None:
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
# tokenizer.pad_token_id = tokenizer.eos_token_id
if tokenizer.pad_token_id is None:
if tokenizer.eos_token_id is not None:
tokenizer.pad_token_id = tokenizer.eos_token_id
elif getattr(model.config, "pad_token_id", None) is not None:
tokenizer.pad_token_id = int(model.config.pad_token_id)
elif getattr(model.config, "eos_token_id", None) is not None:
tokenizer.pad_token_id = int(model.config.eos_token_id)
else:
tokenizer.add_special_tokens({"pad_token": "[PAD]"})
tokenizer.padding_side = "left"
tokenizer.truncation_side = "left" # TODO @nouamane: do we want this?
dummy_inputs = [
"The future of AI is",
# "Passage: Daniel went back to the garden. Mary travelled to the kitchen. Sandra journeyed to the kitchen. Sandra went to the hallway. John went to the bedroom. Mary went back to the garden. Where is Mary?\nAnswer:",
"def fib(n)",
# 'Here is an extract from a webpage: "Have you ever experienced heel pain after a heavy physical activity, or even right after a long period of standing? If you regard this as something usual and normal, then think again. Miscalled as heel pain, plantar fasciitis causes these frequent mild pains experienced in the soles of the feet. It is the inflammation and enlargement the plantar fascia tissue that is located in the heels of the feet, stretching to the base of the toes. This tissue is responsible for absorbing shock in the feet and for supporting the arches. It also plays a vital role in foot movements during walking and standing. Many factors such as excessive walking, standing, and running trigger heel pain and plantar fasciitis. A sudden increase in intensity of activities, increase in weight, and abrupt change of footwear also cause the swelling of the ligament. Non-supportive footwear lacking arch cushions and improper and worn out running or training can also lead to the problem. It is also most evident among those". Write an extensive and detailed course unit suitable for a textbook targeted at college students, related to the given extract, within the context of "Medicine". Do not just list concepts, but develop each one in detail before moving to the next, as we prioritize depth of understanding and comprehensive exploration of the subject matter over breadth. Focus on: - Rigor: Ensure in-depth coverage of the concepts/sections. - Engagement: Write with an academic, professional and engaging tone that captivates interest. - Application: Incorporate specific, practical examples, such as proofs in calculus or critical dates and figures in history. Do not include a title or an introduction, simply write the content without headlines and introductory phrases. Do not use images.',
# "Advancements in technology will lead to",
# "Tomorrow's world is shaped by",
]
outputs = decode_text(
input_iter=(GenerationInput(text=text) for text in dummy_inputs),
tokenizer=tokenizer,
# TODO @thomasw21: From ModelWithLoss extract the model.
model=model.model,
parallel_context=parallel_context,
max_new_tokens=args.max_new_tokens,
max_micro_batch_size=2,
generation_config=GenerationArgs(sampler="greedy", use_cache=True),
tokenizer_config=TokenizerConfig(max_input_length=None),
is_bench=os.environ.get("USE_BENCH", "0") == "1",
)
for output in outputs:
input_ids = output.input_ids
generated_ids = output.generation_ids
if isinstance(input_ids, TensorPointer):
assert isinstance(generated_ids, TensorPointer)
continue
assert isinstance(generated_ids, torch.Tensor)
log_rank(
f"input: {tokenizer.decode(input_ids, clean_up_tokenization_spaces=False)[:1000]}",
logger=logger,
level=logging.INFO,
rank=0,
)
log_rank(
f"generation: {tokenizer.decode(generated_ids[len(input_ids) :], clean_up_tokenization_spaces=False)}",
logger=logger,
level=logging.INFO,
rank=0,
)
log_rank(
"--------------------------------------------------",
logger=logger,
level=logging.INFO,
rank=0,
)
else:
outputs = decode_tokenized(
input_ids=torch.zeros(1, 1).to(dtype=torch.int64, device="cuda"),
input_mask=torch.ones(1, 1).to(dtype=torch.bool, device="cuda"),
model=model.model,
parallel_context=parallel_context,
generation_config=GenerationArgs(sampler="greedy", use_cache=True),
max_micro_batch_size=1,
max_new_tokens=12,
returns_logits=False,
)
for output in outputs:
input_ids = output.input_ids
generated_ids = output.generation_ids
if isinstance(input_ids, TensorPointer):
assert isinstance(generated_ids, TensorPointer)
continue
assert isinstance(generated_ids, torch.Tensor)
log_rank(
f"generation: {generated_ids[len(input_ids) :]}",
logger=logger,
level=logging.INFO,
rank=0,
)
log_rank(
"--------------------------------------------------",
logger=logger,
level=logging.INFO,
rank=0,
)
dist.barrier()
if __name__ == "__main__":
main()
"""
Nanotron training script.
Usage:
```
export CUDA_DEVICE_MAX_CONNECTIONS=1 # important for some distributed operations
torchrun --nproc_per_node=8 run_train.py --config-file examples/config_tiny_llama.yaml
```
"""
import argparse
from typing import Dict, cast
import numpy as np
from nanotron import logging
from nanotron.config import DataArgs, DatasetStageArgs, NanosetDatasetsArgs, PretrainDatasetsArgs
from nanotron.data.dataloader_builder import build_nanoset_dataloader
from nanotron.dataloader import (
clm_process,
dummy_infinite_data_generator,
get_datasets,
get_train_dataloader,
)
from nanotron.helpers import (
compute_remain_train_steps_of_a_data_stage_from_ckp,
get_consumed_train_samples_of_a_data_stage_from_ckp,
)
from nanotron.logging import log_rank
from nanotron.parallel.pipeline_parallel.utils import get_input_output_pp_ranks
from nanotron.trainer import DistributedTrainer
from nanotron.utils import main_rank_first
from torch.utils.data import DataLoader
try:
from huggingface_hub import __version__ as hf_hub_version
from transformers import AutoTokenizer
from transformers import __version__ as tf_version
except ImportError:
hf_hub_version = None
tf_version = None
logger = logging.get_logger(__name__)
def get_dataloader_from_data_stage(
trainer: DistributedTrainer,
data: DataArgs,
consumed_train_samples: int,
num_remaining_train_steps: int,
):
"""
Returns a dataloader for a given data stage.
data: The data configuration for the current stage.
consumed_train_samples: The number of samples consumed by the model in this stage (each stage starts from zero).
num_remaining_train_steps: The number of remaining training steps for this stage.
"""
assert consumed_train_samples >= 0, "consumed_train_samples should be non-negative"
assert num_remaining_train_steps >= 0, "num_remaining_train_steps should be non-negative"
# First, we need to know which ranks to feed the dataloader to
input_pp_rank, output_pp_rank = get_input_output_pp_ranks(model=trainer.model)
# Case 1: Dummy data generator
if data.dataset is None:
log_rank("Using dummy data generator", logger=logger, level=logging.INFO, rank=0)
dataloader = dummy_infinite_data_generator(
micro_batch_size=trainer.micro_batch_size,
sequence_length=trainer.sequence_length,
input_pp_rank=input_pp_rank,
output_pp_rank=output_pp_rank,
vocab_size=trainer.model_config.vocab_size,
seed=data.seed,
parallel_context=trainer.parallel_context,
)()
# Case 2: HuggingFace datasets
elif isinstance(data.dataset, PretrainDatasetsArgs):
log_rank("Using `datasets` library", logger=logger, level=logging.INFO, rank=0)
tokenizer_path = trainer.config.tokenizer.tokenizer_name_or_path
log_rank(
f"Loading tokenizer from {tokenizer_path} and transformers/hf_hub versions {tf_version, hf_hub_version}",
logger=logger,
level=logging.INFO,
rank=0,
)
# We need the first device to process the dataset and cache it; the other devices then load from the cache
with main_rank_first(trainer.parallel_context.world_pg):
# TODO @nouamanetazi: this may timeout before 1st device finishes processing dataset. Can we have a ctxmanager to modify timeout?
# TODO: generalise to include for validation/test splits
# We load the raw dataset
raw_dataset = get_datasets(
hf_dataset_or_datasets=data.dataset.hf_dataset_or_datasets,
hf_dataset_config_name=data.dataset.hf_dataset_config_name,
splits=data.dataset.hf_dataset_splits,
)["train"]
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
# Check that tokenizer's vocab size is smaller than the model's vocab size
assert (
tokenizer.vocab_size <= trainer.model_config.vocab_size
), f"Tokenizer's vocab size ({tokenizer.vocab_size}) is larger than the model's vocab size ({trainer.model_config.vocab_size})"
# We apply the Causal Language Modeling preprocessing
train_dataset = clm_process(
raw_dataset=raw_dataset,
tokenizer=tokenizer,
text_column_name=data.dataset.text_column_name,
dataset_processing_num_proc_per_process=data.dataset.dataset_processing_num_proc_per_process,
dataset_overwrite_cache=data.dataset.dataset_overwrite_cache,
sequence_length=trainer.sequence_length,
)
# We load the processed dataset on the ranks requiring it
dataloader = get_train_dataloader(
train_dataset=train_dataset,
sequence_length=trainer.sequence_length,
parallel_context=trainer.parallel_context,
input_pp_rank=input_pp_rank,
output_pp_rank=output_pp_rank,
micro_batch_size=trainer.micro_batch_size,
consumed_train_samples=consumed_train_samples,
dataloader_num_workers=data.num_loading_workers,
seed_worker=data.seed,
dataloader_drop_last=True,
)
# Check if we have enough samples for train_steps
total_tokens_dataset = len(dataloader.dataset) * trainer.sequence_length
num_tokens_needed_for_training = (
num_remaining_train_steps * trainer.global_batch_size * trainer.sequence_length
)
assert num_tokens_needed_for_training <= total_tokens_dataset, (
f"Dataset is too small for steps ({total_tokens_dataset} < {num_tokens_needed_for_training}), "
f"Try train_steps<={len(dataloader.dataset) // trainer.global_batch_size + trainer.iteration_step}"
)
# Case 3: Nanosets
elif isinstance(data.dataset, NanosetDatasetsArgs):
# Get tokenizer cardinality
tokenizer = AutoTokenizer.from_pretrained(trainer.config.tokenizer.tokenizer_name_or_path)
token_size = 4 if len(tokenizer) > np.iinfo(np.uint16).max + 1 else 2
del tokenizer
# Create Nanoset
from nanotron.data.nanoset import Nanoset
with main_rank_first(trainer.parallel_context.world_pg):
train_dataset = Nanoset(
dataset_folders=data.dataset.dataset_folder,
dataset_weights=data.dataset.dataset_weights,
sequence_length=trainer.sequence_length,
token_size=token_size,
train_split_num_samples=trainer.config.tokens.train_steps * trainer.global_batch_size,
random_seed=data.seed,
)
# Prepare dataloader
train_dataloader = build_nanoset_dataloader(
train_dataset,
trainer.sequence_length,
parallel_context=trainer.parallel_context,
input_pp_rank=input_pp_rank,
output_pp_rank=output_pp_rank,
micro_batch_size=trainer.micro_batch_size,
consumed_train_samples=consumed_train_samples,
dataloader_num_workers=data.num_loading_workers,
dataloader_drop_last=True,
)
return train_dataloader
else:
raise ValueError(f"Unhandled case of `self.config.data.dataset`. Got: {data.dataset}")
return dataloader
def get_dataloader(trainer: DistributedTrainer) -> Dict[str, DataLoader]:
dataloaders = {}
for stage_idx, stage in enumerate(trainer.config.data_stages):
# NOTE: we only create the dataloader for the first stage,
# then we lazily initialize the dataloaders for the other stages
stage = cast(DatasetStageArgs, stage)
consumed_train_samples = get_consumed_train_samples_of_a_data_stage_from_ckp(stage, trainer.metadata)
assert (
consumed_train_samples is not None
), f"Cannot find consumed_train_samples for stage {stage.start_training_step} in the checkpoint"
num_remaining_train_steps = compute_remain_train_steps_of_a_data_stage_from_ckp(
stage, trainer.config, trainer.metadata
)
log_rank(
f"[Training Plan] Stage {stage.name} has {num_remaining_train_steps} remaining training steps and has consumed {consumed_train_samples} samples",
logger=logger,
level=logging.INFO,
rank=0,
)
dataloader = (
get_dataloader_from_data_stage(
trainer,
stage.data,
consumed_train_samples=consumed_train_samples,
num_remaining_train_steps=num_remaining_train_steps,
)
if stage_idx == 0
# bind the loop variables as defaults so each lazily-created dataloader uses its own stage's values
else lambda stage=stage, consumed_train_samples=consumed_train_samples, num_remaining_train_steps=num_remaining_train_steps: get_dataloader_from_data_stage(
trainer,
stage.data,
consumed_train_samples=consumed_train_samples,
num_remaining_train_steps=num_remaining_train_steps,
)
)
dataloaders[stage.name] = dataloader
return dataloaders
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--config-file", type=str, required=True, help="Path to the YAML or python config file")
return parser.parse_args()
if __name__ == "__main__":
args = get_args()
config_file = args.config_file
# Load trainer and data
trainer = DistributedTrainer(config_file)
dataloader = get_dataloader(trainer)
# Train
trainer.train(dataloader)
"""Fixes the problem where '{type.value}_{suffix_name}.safetensors' was duplicated in checkpoint files
For example this script will change the following:
```
checkpoints/10/model/model/decoder/0/pp_block/attn/o_proj/model_model_weight.safetensors_pp-rank-0-of-1_tp-rank-0-of-2.safetensors
to
checkpoints/10/model/model/decoder/0/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-2.safetensors
```
Example Usage:
python scripts/fix_checkpoint_bad_naming.py /fsx/nouamane/projects/nanotron/checkpoints/10
"""
import argparse
import os
import re
from pathlib import Path
def update_checkpoint(checkpoint_dir: str):
print(f"Updating checkpoint in {checkpoint_dir}")
for root, _, files in os.walk(checkpoint_dir):
for file in files:
if file.endswith(".safetensors"):
# r'(?<=model)_(model)' means match the string '_model' that is preceded by 'model'
if len(re.findall(r"(?<=model)_(model)", file)) == 0:
continue
# we remove second _model
new_file = re.sub(r"(?<=model)_(model)", "", file)
# we would have "model_weight.safetensors_pp-rank-0-of-1_tp-rank-0-of-2.safetensors"
# let's assert we have two matches of ".safetensors"
assert len(re.findall(r".safetensors", new_file)) == 2
# then we remove first match
new_file = re.sub(r".safetensors", "", new_file, count=1)
# so that we get "model_weight_pp-rank-0-of-1_tp-rank-0-of-2.safetensors"
print(f"Renaming {file} to {new_file}")
os.rename(os.path.join(root, file), os.path.join(root, new_file))
def main():
parser = argparse.ArgumentParser(description="Update checkpoint from 1.3 to 1.4")
parser.add_argument("checkpoint_dir", type=Path, help="Path to the checkpoint directory")
args = parser.parse_args()
update_checkpoint(args.checkpoint_dir)
if __name__ == "__main__":
main()
"""
This script logs evaluation results to wandb.
python3 log_eval_results_to_wandb.py --eval-path /path/to/eval/results --wandb-project project_name --wandb-name run_name
The folder that contains the evaluation results should have the following structure:
- 5000:
results_x.json # where x is lighteval's evaluation number
- 10000:
...
...
"""
import argparse
import json
import os
from pathlib import Path
import wandb
def run(current_path: Path):
def compute_avg_acc_of_a_benchmark(data, benchmark_prefix):
sum_acc, sum_acc_norm, sum_acc_stderr, sum_acc_norm_stderr, count = 0, 0, 0, 0, 0
for key, values in data.items():
if f"{benchmark_prefix}:" in key:
sum_acc += values["acc"]
sum_acc_norm += values["acc_norm"]
sum_acc_stderr += values["acc_stderr"]
sum_acc_norm_stderr += values["acc_norm_stderr"]
count += 1
average_acc = sum_acc / count if count else 0
return average_acc
def compute_avg_acc_of_all_tasks(data):
sum_acc, count = 0, 0
for _, values in data.items():
sum_acc += values["acc"]
count += 1
average_acc = sum_acc / count if count else 0
return average_acc
list_checkpoints = os.listdir(current_path)
sorted_list_checkpoints = sorted(list_checkpoints, key=int)
for item in sorted_list_checkpoints:
item_path = os.path.join(current_path, item)
if os.path.isdir(item_path):
json_files = [f for f in os.listdir(item_path) if f.endswith(".json")]
if len(json_files) == 1:
json_file_path = os.path.join(item_path, json_files[0])
with open(json_file_path, "r") as file:
eval_data = json.load(file)
iteration_step = eval_data["config_general"]["config"]["general"]["step"]
consumed_train_samples = eval_data["config_general"]["config"]["general"]["consumed_train_samples"]
logging_results = {}
for name, data in eval_data["results"].items():
logging_results[f"{name}_acc"] = data["acc"]
logging_results["mmlu:average_acc"] = compute_avg_acc_of_a_benchmark(eval_data["results"], "mmlu")
logging_results["arc:average_acc"] = compute_avg_acc_of_a_benchmark(eval_data["results"], "arc")
logging_results["all:average_acc"] = compute_avg_acc_of_all_tasks(eval_data["results"])
wandb.log(
{
**logging_results,
"iteration_step": iteration_step,
"consumed_train_samples": consumed_train_samples,
}
)
elif len(json_files) > 1:
print(f"More than one JSON file found in {item_path}. Skipping.")
else:
print(f"No JSON file found in {item_path}.")
print(f"Checkpoint {item} is done. /n")
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--eval-path", type=str, required=True, help="Path of the lighteval's evaluation results")
parser.add_argument(
"--wandb-project", type=str, help="Path of the lighteval's evaluation results", default="nanotron_evals"
)
parser.add_argument(
"--wandb-name",
type=str,
required=True,
help="Name of the wandb run",
)
return parser.parse_args()
if __name__ == "__main__":
args = get_args()
eval_path = args.eval_path
wandb_project = args.wandb_project
wandb_name = args.wandb_name
wandb.init(
project=wandb_project,
name=wandb_name,
config={"eval_path": eval_path},
)
run(eval_path)
# Pre-training
We use the [nanotron](https://github.com/huggingface/nanotron/) library to train the SmolLM and SmolLM2 base models.
The scripts for training SmolLM v1 can be found in the `smollm1` folder. SmolLM2 has a similar architecture and setup but uses an improved, curated data mixture and significantly longer training runs (11 trillion tokens for the 1.7B, 4 trillion for the 360M, and 2 trillion for the 135M). We will upload the SmolLM2 configs soon.
## Setup
Please refer to [nanotron](https://github.com/huggingface/nanotron/) for detailed instructions on setting up your training environment and launching jobs.
After setting up the environment and tokenizing the training datasets with [datatrove](https://github.com/huggingface/datatrove) (instructions available [here](https://github.com/huggingface/nanotron/blob/main/docs/nanoset.md#nanosets)), you can modify the configurations to match your number of nodes and local paths.
Below is an example of launching SmolLM1 135M training on 1 node: change the DP value to 8 in the config, adjust the batch size accordingly, and run:
```bash
git clone https://github.com/huggingface/nanotron
cd nanotron
# follow installation
CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=8 run_train.py --config-file smollm1/config_smollm1_135M.yaml
```
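For a single node with 8 GPUs, `dp` becomes 8 instead of the 32 used across 4 nodes; to keep the same global batch size of 512 sequences per step you can raise the gradient accumulation. A minimal sketch of the relevant changes to `config_smollm1_135M.yaml` (these values are one reasonable choice, not an official single-node recipe):
```yaml
parallelism:
  dp: 8                               # 1 node x 8 GPUs (the provided config uses 32, i.e. 4 nodes)
tokens:
  micro_batch_size: 8
  batch_accumulation_per_replica: 8   # 8 * 8 * 8 = 512 sequences per step, same global batch size
```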
If you are working on a Slurm cluster, you can modify `launch.slurm` and launch the training with:
```bash
sbatch launch.slurm
```
> [!NOTE]
> Don't forget to create the logs directory before launching the job:
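```bash
# matches the ./logs path used by the #SBATCH --output/--error settings in launch.slurm
mkdir -p logs
```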
# SmolLM1 135M trained on 600B tokens
checkpoints:
checkpoint_interval: 2000
checkpoints_path: checkpoints
checkpoints_path_is_shared_file_system: false
resume_checkpoint_path: null
save_final_state: false
save_initial_state: false
data_stages:
- data:
dataset:
dataset_folder: # paths to tokenized datasets
- datasets/fineweb-edu-dedup
- datasets/cosmopedia-v2
- datasets/python-edu
- datasets/open-web-math
- datasets/stackoverflow
dataset_weights:
- 0.7
- 0.15
- 0.08
- 0.06
- 0.01
num_loading_workers: 1
seed: 42
name: training stage
start_training_step: 1
general:
benchmark_csv_path: null
consumed_train_samples: null
ignore_sanity_checks: true
project: smollm
run: smollm-135M
seed: 8
step: null
logging:
iteration_step_info_interval: 1
log_level: info
log_level_replica: info
model:
ddp_bucket_cap_mb: 25
dtype: bfloat16
init_method:
std: 0.0416 # 1/sqrt(hidden_size)
make_vocab_size_divisible_by: 1
model_config:
bos_token_id: 0
eos_token_id: 0
hidden_act: silu
hidden_size: 576
initializer_range: 0.02
intermediate_size: 1536
is_llama_config: true
max_position_embeddings: 2048
num_attention_heads: 9
num_hidden_layers: 30
num_key_value_heads: 3
pad_token_id: null
pretraining_tp: 1
rms_norm_eps: 1.0e-05
rope_scaling: null
rope_theta: 10000.0
tie_word_embeddings: true
use_cache: true
vocab_size: 49152
optimizer:
accumulate_grad_in_fp32: true
clip_grad: 1.0
learning_rate_scheduler:
learning_rate: 0.003
lr_decay_starting_step: 250000
lr_decay_steps: 50000
lr_decay_style: 1-sqrt
lr_warmup_steps: 2500
lr_warmup_style: linear
min_decay_lr: 0
optimizer_factory:
adam_beta1: 0.9
adam_beta2: 0.95
adam_eps: 1.0e-08
name: adamW
torch_adam_is_fused: true
weight_decay: 0.01
zero_stage: 0
parallelism:
dp: 32 # 4 nodes
expert_parallel_size: 1
pp: 1
pp_engine: 1f1b
recompute_layer: false
tp: 1
tp_linear_async_communication: true
tp_mode: REDUCE_SCATTER
tp_recompute_allgather: true
profiler: null
tokenizer:
tokenizer_max_length: null
tokenizer_name_or_path: HuggingFaceTB/cosmo2-tokenizer
tokenizer_revision: null
tokens:
batch_accumulation_per_replica: 2
limit_test_batches: 0
limit_val_batches: 0
micro_batch_size: 8 # GBS = 8*2*32*sequence_length = 512*sequence_length = 1M tokens
sequence_length: 2048
train_steps: 600000
val_check_interval: -1