Commit 71e79847 authored by chenzk

v1.0.3

checkpoints:
checkpoint_interval: 10000
checkpoints_path: checkpoints
checkpoints_path_is_shared_file_system: false
resume_checkpoint_path: null
save_initial_state: false
data_stages:
- name: Stable Training Stage
start_training_step: 1
data:
dataset:
dataset_overwrite_cache: false
dataset_processing_num_proc_per_process: 1
hf_dataset_config_name: null
hf_dataset_or_datasets: roneneldan/TinyStories
hf_dataset_splits: train
text_column_name: text
num_loading_workers: 1
seed: 42
- name: Annealing Phase
start_training_step: 9000
data:
dataset:
dataset_overwrite_cache: false
dataset_processing_num_proc_per_process: 1
hf_dataset_config_name: null
hf_dataset_or_datasets: HuggingFaceH4/testing_alpaca_small
hf_dataset_splits: train
text_column_name: completion
num_loading_workers: 1
seed: 42
general:
benchmark_csv_path: null
consumed_train_samples: null
ignore_sanity_checks: true
project: debug
run: llama_350m_mup
seed: 42
step: null
logging:
iteration_step_info_interval: 1
log_level: debug
log_level_replica: info
model:
ddp_bucket_cap_mb: 120
dtype: bfloat16
init_method:
use_mup: true
make_vocab_size_divisible_by: 1
model_config:
bos_token_id: 1
eos_token_id: 2
hidden_act: silu
initializer_range: 0.02
hidden_size: 1024
intermediate_size: 4096
num_hidden_layers: 14
is_llama_config: true
max_position_embeddings: 1024
num_attention_heads: 8
num_key_value_heads: 4
pad_token_id: null
pretraining_tp: 1
rms_norm_eps: 1.0e-05
rope_scaling: null
tie_word_embeddings: false
use_cache: true
vocab_size: 49152
optimizer:
accumulate_grad_in_fp32: false
adam_beta1: 0.9
adam_beta2: 0.95
adam_eps: 1.0e-08
clip_grad: 1.0
learning_rate_scheduler:
learning_rate: 0.001
lr_decay_starting_step: null
lr_decay_steps: null
lr_decay_style: cosine
lr_warmup_steps: 100 # warmup over the first 100 of the 440 training steps
lr_warmup_style: linear
min_decay_lr: 1.0e-05
torch_adam_is_fused: true
weight_decay: 0.1
zero_stage: 0
parallelism:
dp: 4
pp: 1
pp_engine: 1f1b
tp: 2
tp_linear_async_communication: true
tp_mode: REDUCE_SCATTER
profiler: null
tokenizer:
tokenizer_max_length: null
tokenizer_name_or_path: gpt2
tokenizer_revision: null
tokens:
batch_accumulation_per_replica: 8
limit_test_batches: 0
limit_val_batches: 0
micro_batch_size: 32
sequence_length: 1024
train_steps: 440
val_check_interval: -1
lighteval:
batch_size: 16
checkpoints_path: null
generation: null
logging:
hub_repo_details: null
hub_repo_results: null
# hub_repo_tensorboard: HuggingFaceBR4/fmom-mamba2
local_output_path: /fsx/phuc/new_workspace/experiments/mup_for_mamba2/test_mamba350M_tp4_917cfc66/logs
push_details_to_hub: null
push_results_to_hub: null
push_results_to_tensorboard: true
tensorboard_metric_prefix: e
parallelism:
dp: 2
expert_parallel_size: 1
pp: 1
pp_engine: 1f1b
tp: 2
tp_linear_async_communication: false
tp_mode: ALL_REDUCE
# slurm_script_dir: /fsx/phuc/new_workspace/experiments/mup_for_mamba2/test_mamba350M_tp4_917cfc66/lighteval/slurm_scripts
# slurm_template: /fsx/phuc/new_workspace/experiments/mup_for_mamba2/test_mamba350M_tp4_917cfc66/run_eval.slurm.jinja
tasks:
# custom_tasks: brrr.lighteval.custom_tasks
dataset_loading_processes: 8
max_samples: 1000
multichoice_continuations_start_space: null
no_multichoice_continuations_start_space: null
num_fewshot_seeds: null
tasks: early-signal
wandb: null
checkpoints:
checkpoint_interval: 10000
checkpoints_path: checkpoints
checkpoints_path_is_shared_file_system: false
resume_checkpoint_path: null
save_initial_state: false
data_stages:
- name: Stable Training Stage
start_training_step: 1
data:
dataset:
dataset_overwrite_cache: false
dataset_processing_num_proc_per_process: 1
hf_dataset_config_name: null
hf_dataset_or_datasets: roneneldan/TinyStories
hf_dataset_splits: train
text_column_name: text
num_loading_workers: 1
seed: 42
- name: Annealing Phase
start_training_step: 9000
data:
dataset:
dataset_overwrite_cache: false
dataset_processing_num_proc_per_process: 1
hf_dataset_config_name: null
hf_dataset_or_datasets: HuggingFaceH4/testing_alpaca_small
hf_dataset_splits: train
text_column_name: completion
num_loading_workers: 1
seed: 42
general:
benchmark_csv_path: null
consumed_train_samples: null
ignore_sanity_checks: true
project: debug
run: llama_350m_sp
seed: 42
step: null
lighteval: null
logging:
iteration_step_info_interval: 1
log_level: info
log_level_replica: info
model:
ddp_bucket_cap_mb: 120
dtype: bfloat16
init_method:
std: 0.03125 # 1/sqrt(hidden_size) = 1/sqrt(1024)
make_vocab_size_divisible_by: 1
model_config:
bos_token_id: 1
eos_token_id: 2
hidden_act: silu
initializer_range: 0.02
hidden_size: 1024
intermediate_size: 4096
num_hidden_layers: 14
is_llama_config: true
max_position_embeddings: 1024
num_attention_heads: 8
num_key_value_heads: 4
pad_token_id: null
pretraining_tp: 1
rms_norm_eps: 1.0e-05
rope_scaling: null
tie_word_embeddings: false
use_cache: true
vocab_size: 49152
optimizer:
accumulate_grad_in_fp32: false
adam_beta1: 0.9
adam_beta2: 0.95
adam_eps: 1.0e-08
clip_grad: 1.0
learning_rate_scheduler:
learning_rate: 0.001
lr_decay_starting_step: null
lr_decay_steps: null
lr_decay_style: cosine
lr_warmup_steps: 100 # warmup over the first 100 of the 440 training steps
lr_warmup_style: linear
min_decay_lr: 1.0e-05
torch_adam_is_fused: true
weight_decay: 0.1
zero_stage: 0
parallelism:
dp: 4
pp: 1
pp_engine: 1f1b
tp: 2
tp_linear_async_communication: true
tp_mode: REDUCE_SCATTER
profiler: null
tokenizer:
tokenizer_max_length: null
tokenizer_name_or_path: gpt2
tokenizer_revision: null
tokens:
batch_accumulation_per_replica: 8
limit_test_batches: 0
limit_val_batches: 0
micro_batch_size: 32
sequence_length: 1024
train_steps: 440
val_check_interval: -1
#!/bin/bash
# Simple script to create a tiny llama model and train it
set -e -x
# Create the YAML config file
EXAMPLE_PATH=$(cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P)
REPO_PATH=$(dirname $EXAMPLE_PATH)
python $EXAMPLE_PATH/config_tiny_llama.py
# Setup from environment variables
export CUDA_DEVICE_MAX_CONNECTIONS=1
export FI_PROVIDER="efa"
python -u -m torch.distributed.run \
--nproc_per_node 8 \
--nnodes 1 \
--rdzv_backend c10d \
--max_restarts 0 \
--tee 3 \
$REPO_PATH/run_train.py --config-file $EXAMPLE_PATH/config_tiny_llama.yaml
wget https://mirror.ghproxy.com/https://github.com/git-lfs/git-lfs/releases/download/v3.5.1/git-lfs-linux-amd64-v3.5.1.tar.gz
tar -xzvf git-lfs-linux-amd64-v3.5.1.tar.gz
./git-lfs-3.5.1/install.sh
rm -rf git-lfs-3.5.1 git-lfs-linux-amd64-v3.5.1.tar.gz
icon.png (53.8 KB)
torchrun --nproc_per_node=1 run_generate.py --ckpt-path checkpoints/10/ --tp 1 --pp 1
#!/bin/bash
#SBATCH --job-name=smollm1-135M
#SBATCH --nodes=4
#SBATCH --gres=gpu:8
#SBATCH --qos=high
#SBATCH --output=./logs/train-%j.out
#SBATCH --error=./logs/train-%j.err
set -e
TRAINER_PYTHON_FILE="run_train.py"
CONFIG_PATH_YAML="smollm1/config_smollm1_135M.yaml"
nvidia-smi
# Show some environment variables
echo python3 version = `python3 --version`
echo "Python path: $(which python3)"
echo "NCCL version: $(python -c "import torch;print(torch.cuda.nccl.version())")"
echo "CUDA version: $(python -c "import torch;print(torch.version.cuda)")"
echo "START TIME: $(date)"
secs_to_human() {
echo "$(( ${1} / 3600 )):$(( (${1} / 60) % 60 )):$(( ${1} % 60 ))"
}
start=$(date +%s)
echo "$(date -d @${start} "+%Y-%m-%d %H:%M:%S"): ${SLURM_JOB_NAME} start id=${SLURM_JOB_ID}\n"
# SLURM stuff
export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"`
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_PORT=6000
export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l`
export CUDA_DEVICE_MAX_CONNECTIONS="1"
echo "Number of nodes: $COUNT_NODE"
echo "Hostnames: $HOSTNAMES"
CMD=" $TRAINER_PYTHON_FILE \
--config-file $CONFIG_PATH_YAML \
"
export LAUNCHER="torchrun \
--nproc_per_node 8 \
--nnodes $COUNT_NODE \
--node_rank $SLURM_PROCID \
--role $SLURMD_NODENAME: \
--max_restarts 0 \
--tee 3 \
"
# Wait a random amount of time between 0 and 1000 milliseconds to avoid too many concurrent requests to the hub
random_milliseconds=$(( RANDOM % 1001 ))
sleep_time=$(bc <<< "scale=3; $random_milliseconds / 1000")
echo "Sleeping for $sleep_time seconds..."
sleep $sleep_time
srun $SRUN_ARGS -u bash -c "$LAUNCHER --node_rank $SLURM_PROCID --role $SLURMD_NODENAME: $CMD"
echo "END TIME: $(date)"
# Model code
modelCode=1128
# Model name
modelName=nanotron_pytorch
# Model description
modelDescription=Fully open-source large-model pretraining; this project can pretrain large language models that outperform qwen2.5 and llama3, and is the training code used by some major AI companies.
# Application scenarios
appScenario=inference,training,dialogue Q&A,manufacturing,broadcast media,finance,energy,healthcare,smart home,education
# Framework type
frameType=pytorch
[project]
name = "nanotron"
version = "0.4"
description = "Minimalistic Large Language Model Training and Finetuning"
authors = [
{name = "Nouamane Tazi", email="nouamane@huggingface.co"},
{name = "Thomas Wang", email="thomas.wang@huggingface.co"},
{name = "Kunhao Zheng", email="kunhao@huggingface.co"},
{name = "Thomas Wolf", email="thomas@huggingface.co"},
]
readme = "README.md"
requires-python = "~=3.10"
classifiers = [
"Topic :: Software Development"
]
dependencies = [
"torch>=1.13.1",
"pyyaml",
"numpy",
"packaging",
"safetensors",
"dacite",
"tqdm",
"datasets",
]
[tool.setuptools.packages.find]
where = ["src"] # list of folders that contain the packages (["."] by default)
[tool.ruff]
line-length = 119
ignore = ["C901","E501"] # E501: line-length violations are handled by black
select = ["C","E","F","I","W"]
ignore-init-module-imports = true
[project.optional-dependencies]
dev = [
"pre-commit",
"pylint"
]
test = [
"pytest",
"pytest-xdist"
]
fast-modeling = [
"flash-attn>=2.5.0",
]
nanosets = [
"transformers",
"datatrove[io,processing]@git+https://github.com/huggingface/datatrove",
"numba",
]
s3 = [
"boto3",
"s3fs",
"s5cmd",
]
[build-system]
requires = [
"setuptools",
]
[pytest]
norecursedirs="tests/helpers"
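Given the optional dependency groups defined above, an editable install from the repository root would typically look like the sketch below (the `dev`, `test`, `fast-modeling`, and `nanosets` extras match the groups in `pyproject.toml`; note that `flash-attn` needs a CUDA toolchain to build):
```bash
pip install -e ".[dev,test]"
# optionally, add flash-attention kernels and Nanoset support
pip install -e ".[fast-modeling,nanosets]"
```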
dummy-tokenizer-wordlevel @ ae57c419
Subproject commit ae57c419a98ae4ddf991c6a1af4a8ce94745f45c
"""
Nanotron Inference Script
Usage:
```
export CUDA_DEVICE_MAX_CONNECTIONS=1 # important for some distributed operations
torchrun --nproc_per_node=4 run_generate.py --ckpt-path checkpoints/test/4
```
"""
import argparse
import os
from pathlib import Path
import torch
from nanotron import distributed as dist
from nanotron import logging
from nanotron.config import (
GenerationArgs,
LoggingArgs,
ParallelismArgs,
get_config_from_file,
)
from nanotron.generation.decode import (
GenerationInput,
TokenizerConfig,
decode_text,
decode_tokenized,
)
from nanotron.logging import log_rank, set_ranks_logging_level
from nanotron.models import build_model
from nanotron.parallel import ParallelContext
from nanotron.parallel.parameters import sanity_check
from nanotron.parallel.pipeline_parallel.engine import (
OneForwardOneBackwardPipelineEngine,
)
from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer
from nanotron.parallel.tensor_parallel.enum import TensorParallelLinearMode
from nanotron.random import (
RandomStates,
get_current_random_state,
get_synced_random_state,
set_random_seed,
)
from nanotron.serialize import load_weights
from nanotron.trainer import CONFIG_TO_MODEL_CLASS, mark_tied_parameters
try:
from transformers import AutoTokenizer
except ImportError:
AutoTokenizer = None
logger = logging.get_logger(__name__)
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--ckpt-path", type=Path, required=True, help="Checkpoint path")
parser.add_argument("--dp", type=int, default=1)
parser.add_argument("--pp", type=int, default=0)
parser.add_argument("--tp", type=int, default=0)
parser.add_argument("--max-new-tokens", type=int, default=128, help="Maximum number of new tokens to generate")
return parser.parse_args()
def main():
args = get_args()
assert args.ckpt_path.exists(), f"Checkpoint path {args.ckpt_path} does not exist"
config = get_config_from_file((args.ckpt_path / "config.yaml").as_posix())
model_config = config.model.model_config
tokenizer_path = config.tokenizer.tokenizer_name_or_path
parallel_config = ParallelismArgs(
dp=args.dp or config.parallelism.dp,
pp=args.pp or config.parallelism.pp,
tp=args.tp or config.parallelism.tp,
pp_engine=OneForwardOneBackwardPipelineEngine(),
tp_mode=TensorParallelLinearMode.ALL_REDUCE,
tp_linear_async_communication=False,
)
# Initialise all process groups
parallel_context = ParallelContext(
data_parallel_size=parallel_config.dp,
pipeline_parallel_size=parallel_config.pp,
tensor_parallel_size=parallel_config.tp,
)
# Set log levels
logging_config = LoggingArgs(
log_level="info",
log_level_replica="info",
)
# Set log levels
set_ranks_logging_level(parallel_context=parallel_context, logging_config=logging_config)
log_rank(f"model_config: {model_config}", logger=logger, level=logging.INFO, rank=0)
log_rank(f"tokenizer_path: {tokenizer_path}", logger=logger, level=logging.INFO, rank=0)
dtype = torch.bfloat16
# Set random states
set_random_seed(42)
model_config_cls = model_config.__class__.__name__
if model_config_cls not in CONFIG_TO_MODEL_CLASS:
raise ValueError(
f"Unsupported model config {model_config_cls}. Only {CONFIG_TO_MODEL_CLASS.keys()} are supported"
)
# Get synchronized random states
if parallel_config.tp_mode is TensorParallelLinearMode.ALL_REDUCE:
random_states = RandomStates(
{"tp_synced": get_synced_random_state(random_state=get_current_random_state(), pg=parallel_context.tp_pg)}
)
else:
# We don't need to sync across TP when using sequence parallel (REDUCE_SCATTER)
random_states = RandomStates({})
model = build_model(
model_builder=lambda: CONFIG_TO_MODEL_CLASS[model_config_cls](
config=model_config,
parallel_context=parallel_context,
parallel_config=parallel_config,
random_states=random_states,
),
dtype=dtype,
parallel_context=parallel_context,
)
# Mark some parameters as tied
# TODO @nouamane: this is only needed for training, can we just mark params as NanotronParameter instead?
mark_tied_parameters(model=model, parallel_context=parallel_context, parallel_config=parallel_config)
# Sanity check model
sanity_check(root_module=model)
# Load checkpoint
checkpoint_path = args.ckpt_path
log_rank(
f"Loading checkpoint from {checkpoint_path}:",
logger=logger,
level=logging.INFO,
rank=0,
)
load_weights(model=model, parallel_context=parallel_context, root_folder=checkpoint_path)
model.eval()
if AutoTokenizer is not None:
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
# tokenizer.pad_token_id = tokenizer.eos_token_id
if tokenizer.pad_token_id is None:
if tokenizer.eos_token_id is not None:
tokenizer.pad_token_id = tokenizer.eos_token_id
elif getattr(model.config, "pad_token_id", None) is not None:
tokenizer.pad_token_id = int(model.config.pad_token_id)
elif getattr(model.config, "eos_token_id", None) is not None:
tokenizer.pad_token_id = int(model.config.eos_token_id)
else:
tokenizer.add_special_tokens({"pad_token": "[PAD]"})
tokenizer.padding_side = "left"
tokenizer.truncation_side = "left" # TODO @nouamane: do we want this?
dummy_inputs = [
"The future of AI is",
# "Passage: Daniel went back to the garden. Mary travelled to the kitchen. Sandra journeyed to the kitchen. Sandra went to the hallway. John went to the bedroom. Mary went back to the garden. Where is Mary?\nAnswer:",
"def fib(n)",
# 'Here is an extract from a webpage: "Have you ever experienced heel pain after a heavy physical activity, or even right after a long period of standing? If you regard this as something usual and normal, then think again. Miscalled as heel pain, plantar fasciitis causes these frequent mild pains experienced in the soles of the feet. It is the inflammation and enlargement the plantar fascia tissue that is located in the heels of the feet, stretching to the base of the toes. This tissue is responsible for absorbing shock in the feet and for supporting the arches. It also plays a vital role in foot movements during walking and standing. Many factors such as excessive walking, standing, and running trigger heel pain and plantar fasciitis. A sudden increase in intensity of activities, increase in weight, and abrupt change of footwear also cause the swelling of the ligament. Non-supportive footwear lacking arch cushions and improper and worn out running or training can also lead to the problem. It is also most evident among those". Write an extensive and detailed course unit suitable for a textbook targeted at college students, related to the given extract, within the context of "Medicine". Do not just list concepts, but develop each one in detail before moving to the next, as we prioritize depth of understanding and comprehensive exploration of the subject matter over breadth. Focus on: - Rigor: Ensure in-depth coverage of the concepts/sections. - Engagement: Write with an academic, professional and engaging tone that captivates interest. - Application: Incorporate specific, practical examples, such as proofs in calculus or critical dates and figures in history. Do not include a title or an introduction, simply write the content without headlines and introductory phrases. Do not use images.',
# "Advancements in technology will lead to",
# "Tomorrow's world is shaped by",
]
outputs = decode_text(
input_iter=(GenerationInput(text=text) for text in dummy_inputs),
tokenizer=tokenizer,
# TODO @thomasw21: From ModelWithLoss extract the model.
model=model.model,
parallel_context=parallel_context,
max_new_tokens=args.max_new_tokens,
max_micro_batch_size=2,
generation_config=GenerationArgs(sampler="greedy", use_cache=True),
tokenizer_config=TokenizerConfig(max_input_length=None),
is_bench=os.environ.get("USE_BENCH", "0") == "1",
)
for output in outputs:
input_ids = output.input_ids
generated_ids = output.generation_ids
if isinstance(input_ids, TensorPointer):
assert isinstance(generated_ids, TensorPointer)
continue
assert isinstance(generated_ids, torch.Tensor)
log_rank(
f"input: {tokenizer.decode(input_ids, clean_up_tokenization_spaces=False)[:1000]}",
logger=logger,
level=logging.INFO,
rank=0,
)
log_rank(
f"generation: {tokenizer.decode(generated_ids[len(input_ids) :], clean_up_tokenization_spaces=False)}",
logger=logger,
level=logging.INFO,
rank=0,
)
log_rank(
"--------------------------------------------------",
logger=logger,
level=logging.INFO,
rank=0,
)
else:
outputs = decode_tokenized(
input_ids=torch.zeros(1, 1).to(dtype=torch.int64, device="cuda"),
input_mask=torch.ones(1, 1).to(dtype=torch.bool, device="cuda"),
model=model.model,
parallel_context=parallel_context,
generation_config=GenerationArgs(sampler="greedy", use_cache=True),
max_micro_batch_size=1,
max_new_tokens=12,
returns_logits=False,
)
for output in outputs:
input_ids = output.input_ids
generated_ids = output.generation_ids
if isinstance(input_ids, TensorPointer):
assert isinstance(generated_ids, TensorPointer)
continue
assert isinstance(generated_ids, torch.Tensor)
log_rank(
f"generation: {generated_ids[len(input_ids) :]}",
logger=logger,
level=logging.INFO,
rank=0,
)
log_rank(
"--------------------------------------------------",
logger=logger,
level=logging.INFO,
rank=0,
)
dist.barrier()
if __name__ == "__main__":
main()
"""
Nanotron training script.
Usage:
```
export CUDA_DEVICE_MAX_CONNECTIONS=1 # important for some distributed operations
torchrun --nproc_per_node=8 run_train.py --config-file examples/config_tiny_llama.yaml
```
"""
import argparse
from typing import Dict, cast
import numpy as np
from nanotron import logging
from nanotron.config import DataArgs, DatasetStageArgs, NanosetDatasetsArgs, PretrainDatasetsArgs
from nanotron.data.dataloader_builder import build_nanoset_dataloader
from nanotron.dataloader import (
clm_process,
dummy_infinite_data_generator,
get_datasets,
get_train_dataloader,
)
from nanotron.helpers import (
compute_remain_train_steps_of_a_data_stage_from_ckp,
get_consumed_train_samples_of_a_data_stage_from_ckp,
)
from nanotron.logging import log_rank
from nanotron.parallel.pipeline_parallel.utils import get_input_output_pp_ranks
from nanotron.trainer import DistributedTrainer
from nanotron.utils import main_rank_first
from torch.utils.data import DataLoader
try:
from huggingface_hub import __version__ as hf_hub_version
from transformers import AutoTokenizer
from transformers import __version__ as tf_version
except ImportError:
hf_hub_version = None
tf_version = None
logger = logging.get_logger(__name__)
def get_dataloader_from_data_stage(
trainer: DistributedTrainer,
data: DataArgs,
consumed_train_samples: int,
num_remaining_train_steps: int,
):
"""
Returns a dataloader for a given data stage.
data: The data configuration for the current stage.
consumed_train_samples: The number of samples consumed by the model in this stage (each stage starts from zero).
num_remaining_train_steps: The number of remaining training steps for this stage.
"""
assert consumed_train_samples >= 0, "consumed_train_samples should be non-negative"
assert num_remaining_train_steps >= 0, "num_remaining_train_steps should be non-negative"
# First, we need to know which ranks to feed the dataloader to
input_pp_rank, output_pp_rank = get_input_output_pp_ranks(model=trainer.model)
# Case 1: Dummy data generator
if data.dataset is None:
log_rank("Using dummy data generator", logger=logger, level=logging.INFO, rank=0)
dataloader = dummy_infinite_data_generator(
micro_batch_size=trainer.micro_batch_size,
sequence_length=trainer.sequence_length,
input_pp_rank=input_pp_rank,
output_pp_rank=output_pp_rank,
vocab_size=trainer.model_config.vocab_size,
seed=data.seed,
parallel_context=trainer.parallel_context,
)()
# Case 2: HuggingFace datasets
elif isinstance(data.dataset, PretrainDatasetsArgs):
log_rank("Using `datasets` library", logger=logger, level=logging.INFO, rank=0)
tokenizer_path = trainer.config.tokenizer.tokenizer_name_or_path
log_rank(
f"Loading tokenizer from {tokenizer_path} and transformers/hf_hub versions {tf_version, hf_hub_version}",
logger=logger,
level=logging.INFO,
rank=0,
)
# We need the first device to process the dataset and cache it; the other devices then load from the cache
with main_rank_first(trainer.parallel_context.world_pg):
# TODO @nouamanetazi: this may timeout before 1st device finishes processing dataset. Can we have a ctxmanager to modify timeout?
# TODO: generalise to include for validation/test splits
# We load the raw dataset
raw_dataset = get_datasets(
hf_dataset_or_datasets=data.dataset.hf_dataset_or_datasets,
hf_dataset_config_name=data.dataset.hf_dataset_config_name,
splits=data.dataset.hf_dataset_splits,
)["train"]
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
# Check that tokenizer's vocab size is smaller than the model's vocab size
assert (
tokenizer.vocab_size <= trainer.model_config.vocab_size
), f"Tokenizer's vocab size ({tokenizer.vocab_size}) is larger than the model's vocab size ({trainer.model_config.vocab_size})"
# We apply the Causal Language Modeling preprocessing
train_dataset = clm_process(
raw_dataset=raw_dataset,
tokenizer=tokenizer,
text_column_name=data.dataset.text_column_name,
dataset_processing_num_proc_per_process=data.dataset.dataset_processing_num_proc_per_process,
dataset_overwrite_cache=data.dataset.dataset_overwrite_cache,
sequence_length=trainer.sequence_length,
)
# We load the processed dataset on the ranks requiring it
dataloader = get_train_dataloader(
train_dataset=train_dataset,
sequence_length=trainer.sequence_length,
parallel_context=trainer.parallel_context,
input_pp_rank=input_pp_rank,
output_pp_rank=output_pp_rank,
micro_batch_size=trainer.micro_batch_size,
consumed_train_samples=consumed_train_samples,
dataloader_num_workers=data.num_loading_workers,
seed_worker=data.seed,
dataloader_drop_last=True,
)
# Check if we have enough samples for train_steps
total_tokens_dataset = len(dataloader.dataset) * trainer.sequence_length
num_tokens_needed_for_training = (
num_remaining_train_steps * trainer.global_batch_size * trainer.sequence_length
)
assert num_tokens_needed_for_training <= total_tokens_dataset, (
f"Dataset is too small for steps ({total_tokens_dataset} < {num_tokens_needed_for_training}), "
f"Try train_steps<={len(dataloader.dataset) // trainer.global_batch_size + trainer.iteration_step}"
)
# Case 3: Nanosets
elif isinstance(data.dataset, NanosetDatasetsArgs):
# Get tokenizer cardinality
tokenizer = AutoTokenizer.from_pretrained(trainer.config.tokenizer.tokenizer_name_or_path)
token_size = 4 if len(tokenizer) > np.iinfo(np.uint16).max + 1 else 2
del tokenizer
# Create Nanoset
from nanotron.data.nanoset import Nanoset
with main_rank_first(trainer.parallel_context.world_pg):
train_dataset = Nanoset(
dataset_folders=data.dataset.dataset_folder,
dataset_weights=data.dataset.dataset_weights,
sequence_length=trainer.sequence_length,
token_size=token_size,
train_split_num_samples=trainer.config.tokens.train_steps * trainer.global_batch_size,
random_seed=data.seed,
)
# Prepare dataloader
train_dataloader = build_nanoset_dataloader(
train_dataset,
trainer.sequence_length,
parallel_context=trainer.parallel_context,
input_pp_rank=input_pp_rank,
output_pp_rank=output_pp_rank,
micro_batch_size=trainer.micro_batch_size,
consumed_train_samples=consumed_train_samples,
dataloader_num_workers=data.num_loading_workers,
dataloader_drop_last=True,
)
return train_dataloader
else:
raise ValueError(f"Unhandled case of `self.config.data.dataset`. Got: {data.dataset}")
return dataloader
def get_dataloader(trainer: DistributedTrainer) -> Dict[str, DataLoader]:
dataloaders = {}
for stage_idx, stage in enumerate(trainer.config.data_stages):
# NOTE: we only create the dataloader for the first stage,
# then we lazily initialize the dataloaders for the other stages
stage = cast(DatasetStageArgs, stage)
consumed_train_samples = get_consumed_train_samples_of_a_data_stage_from_ckp(stage, trainer.metadata)
assert (
consumed_train_samples is not None
), f"Cannot find consumed_train_samples for stage {stage.start_training_step} in the checkpoint"
num_remaining_train_steps = compute_remain_train_steps_of_a_data_stage_from_ckp(
stage, trainer.config, trainer.metadata
)
log_rank(
f"[Training Plan] Stage {stage.name} has {num_remaining_train_steps} remaining training steps and has consumed {consumed_train_samples} samples",
logger=logger,
level=logging.INFO,
rank=0,
)
dataloader = (
get_dataloader_from_data_stage(
trainer,
stage.data,
consumed_train_samples=consumed_train_samples,
num_remaining_train_steps=num_remaining_train_steps,
)
if stage_idx == 0
# bind the loop variables as defaults so each lazily-created dataloader uses its own stage's values
else lambda stage=stage, consumed_train_samples=consumed_train_samples, num_remaining_train_steps=num_remaining_train_steps: get_dataloader_from_data_stage(
trainer,
stage.data,
consumed_train_samples=consumed_train_samples,
num_remaining_train_steps=num_remaining_train_steps,
)
)
dataloaders[stage.name] = dataloader
return dataloaders
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--config-file", type=str, required=True, help="Path to the YAML or python config file")
return parser.parse_args()
if __name__ == "__main__":
args = get_args()
config_file = args.config_file
# Load trainer and data
trainer = DistributedTrainer(config_file)
dataloader = get_dataloader(trainer)
# Train
trainer.train(dataloader)
"""Fixes the problem where '{type.value}_{suffix_name}.safetensors' was duplicated in checkpoint files
For example this script will change the following:
```
checkpoints/10/model/model/decoder/0/pp_block/attn/o_proj/model_model_weight.safetensors_pp-rank-0-of-1_tp-rank-0-of-2.safetensors
to
checkpoints/10/model/model/decoder/0/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-2.safetensors
```
Example Usage:
python scripts/fix_checkpoint_bad_naming.py /fsx/nouamane/projects/nanotron/checkpoints/10
"""
import argparse
import os
import re
from pathlib import Path
def update_checkpoint(checkpoint_dir: str):
print(f"Updating checkpoint in {checkpoint_dir}")
for root, _, files in os.walk(checkpoint_dir):
for file in files:
if file.endswith(".safetensors"):
# r'(?<=model)_(model)' means match the string '_model' that is preceded by 'model'
if len(re.findall(r"(?<=model)_(model)", file)) == 0:
continue
# we remove second _model
new_file = re.sub(r"(?<=model)_(model)", "", file)
# we would have "model_weight.safetensors_pp-rank-0-of-1_tp-rank-0-of-2.safetensors"
# let's assert we have two matches of ".safetensors"
assert len(re.findall(r".safetensors", new_file)) == 2
# then we remove first match
new_file = re.sub(r".safetensors", "", new_file, count=1)
# so that we get "model_weight_pp-rank-0-of-1_tp-rank-0-of-2.safetensors"
print(f"Renaming {file} to {new_file}")
os.rename(os.path.join(root, file), os.path.join(root, new_file))
def main():
parser = argparse.ArgumentParser(description="Update checkpoint from 1.3 to 1.4")
parser.add_argument("checkpoint_dir", type=Path, help="Path to the checkpoint directory")
args = parser.parse_args()
update_checkpoint(args.checkpoint_dir)
if __name__ == "__main__":
main()
"""
This script logs evaluation results to wandb.
python3 log_eval_results_to_wandb.py --eval-path /path/to/eval/results --wandb-project project_name --wandb-name run_name
The folder that contains the evaluation results should have the following structure:
- 5000:
results_x.json # where x is lighteval's evaluation number
- 10000:
...
...
"""
import argparse
import json
import os
from pathlib import Path
import wandb
def run(current_path: Path):
def compute_avg_acc_of_a_benchmark(data, benchmark_prefix):
sum_acc, sum_acc_norm, sum_acc_stderr, sum_acc_norm_stderr, count = 0, 0, 0, 0, 0
for key, values in data.items():
if f"{benchmark_prefix}:" in key:
sum_acc += values["acc"]
sum_acc_norm += values["acc_norm"]
sum_acc_stderr += values["acc_stderr"]
sum_acc_norm_stderr += values["acc_norm_stderr"]
count += 1
average_acc = sum_acc / count if count else 0
return average_acc
def compute_avg_acc_of_all_tasks(data):
sum_acc, count = 0, 0
for _, values in data.items():
sum_acc += values["acc"]
count += 1
average_acc = sum_acc / count if count else 0
return average_acc
list_checkpoints = os.listdir(current_path)
sorted_list_checkpoints = sorted(list_checkpoints, key=int)
for item in sorted_list_checkpoints:
item_path = os.path.join(current_path, item)
if os.path.isdir(item_path):
json_files = [f for f in os.listdir(item_path) if f.endswith(".json")]
if len(json_files) == 1:
json_file_path = os.path.join(item_path, json_files[0])
with open(json_file_path, "r") as file:
eval_data = json.load(file)
iteration_step = eval_data["config_general"]["config"]["general"]["step"]
consumed_train_samples = eval_data["config_general"]["config"]["general"]["consumed_train_samples"]
logging_results = {}
for name, data in eval_data["results"].items():
logging_results[f"{name}_acc"] = data["acc"]
logging_results["mmlu:average_acc"] = compute_avg_acc_of_a_benchmark(eval_data["results"], "mmlu")
logging_results["arc:average_acc"] = compute_avg_acc_of_a_benchmark(eval_data["results"], "arc")
logging_results["all:average_acc"] = compute_avg_acc_of_all_tasks(eval_data["results"])
wandb.log(
{
**logging_results,
"iteration_step": iteration_step,
"consumed_train_samples": consumed_train_samples,
}
)
elif len(json_files) > 1:
print(f"More than one JSON file found in {item_path}. Skipping.")
else:
print(f"No JSON file found in {item_path}.")
print(f"Checkpoint {item} is done. /n")
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--eval-path", type=str, required=True, help="Path of the lighteval's evaluation results")
parser.add_argument(
"--wandb-project", type=str, help="Path of the lighteval's evaluation results", default="nanotron_evals"
)
parser.add_argument(
"--wandb-name",
type=str,
required=True,
help="Name of the wandb run",
)
return parser.parse_args()
if __name__ == "__main__":
args = get_args()
eval_path = args.eval_path
wandb_project = args.wandb_project
wandb_name = args.wandb_name
wandb.init(
project=wandb_project,
name=wandb_name,
config={"eval_path": eval_path},
)
run(eval_path)
# Pre-training
We use the [nanotron](https://github.com/huggingface/nanotron/) library to train the SmolLM and SmolLM2 base models.
The scripts for training SmolLM v1 can be found in the `smollm1` folder. SmolLM2 has a similar architecture and setup but uses an improved, curated data mixture and significantly longer training runs (11 trillion tokens for the 1.7B, 4 trillion for the 360M, and 2 trillion for the 135M). We will upload the SmolLM2 configs soon.
## Setup
Please refer to [nanotron](https://github.com/huggingface/nanotron/) for detailed instructions on setting up your training environment and launching jobs.
After setting up the environment and tokenizing the training datasets with [datatrove](https://github.com/huggingface/datatrove) (instructions available [here](https://github.com/huggingface/nanotron/blob/main/docs/nanoset.md#nanosets)), you can modify the configurations to match your number of nodes and local paths.
Below is an example of launching SmolLM1 135M training on 1 node: change the DP value to 8 in the config, adjust the batch size accordingly, and run:
```bash
git clone https://github.com/huggingface/nanotron
cd nanotron
# follow installation
CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=8 run_train.py --config-file smollm1/config_smollm1_135M.yaml
```
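For a single node with 8 GPUs, `dp` becomes 8 instead of the 32 used across 4 nodes; to keep the same global batch size of 512 sequences per step you can raise the gradient accumulation. A minimal sketch of the relevant changes to `config_smollm1_135M.yaml` (these values are one reasonable choice, not an official single-node recipe):
```yaml
parallelism:
  dp: 8                               # 1 node x 8 GPUs (the provided config uses 32, i.e. 4 nodes)
tokens:
  micro_batch_size: 8
  batch_accumulation_per_replica: 8   # 8 * 8 * 8 = 512 sequences per step, same global batch size
```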
If you are working on a Slurm cluster, you can modify `launch.slurm` and launch the training with:
```bash
sbatch launch.slurm
```
> [!NOTE]
> Don't forget to create the logs directory before launching the job:
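```bash
# matches the ./logs path used by the #SBATCH --output/--error settings in launch.slurm
mkdir -p logs
```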
# SmolLM1 135M trained on 600B tokens
checkpoints:
checkpoint_interval: 2000
checkpoints_path: checkpoints
checkpoints_path_is_shared_file_system: false
resume_checkpoint_path: null
save_final_state: false
save_initial_state: false
data_stages:
- data:
dataset:
dataset_folder: # paths to tokenized datasets
- datasets/fineweb-edu-dedup
- datasets/cosmopedia-v2
- datasets/python-edu
- datasets/open-web-math
- datasets/stackoverflow
dataset_weights:
- 0.7
- 0.15
- 0.08
- 0.06
- 0.01
num_loading_workers: 1
seed: 42
name: training stage
start_training_step: 1
general:
benchmark_csv_path: null
consumed_train_samples: null
ignore_sanity_checks: true
project: smollm
run: smollm-135M
seed: 8
step: null
logging:
iteration_step_info_interval: 1
log_level: info
log_level_replica: info
model:
ddp_bucket_cap_mb: 25
dtype: bfloat16
init_method:
std: 0.0416 # 1/sqrt(hidden_size)
make_vocab_size_divisible_by: 1
model_config:
bos_token_id: 0
eos_token_id: 0
hidden_act: silu
hidden_size: 576
initializer_range: 0.02
intermediate_size: 1536
is_llama_config: true
max_position_embeddings: 2048
num_attention_heads: 9
num_hidden_layers: 30
num_key_value_heads: 3
pad_token_id: null
pretraining_tp: 1
rms_norm_eps: 1.0e-05
rope_scaling: null
rope_theta: 10000.0
tie_word_embeddings: true
use_cache: true
vocab_size: 49152
optimizer:
accumulate_grad_in_fp32: true
clip_grad: 1.0
learning_rate_scheduler:
learning_rate: 0.003
lr_decay_starting_step: 250000
lr_decay_steps: 50000
lr_decay_style: 1-sqrt
lr_warmup_steps: 2500
lr_warmup_style: linear
min_decay_lr: 0
optimizer_factory:
adam_beta1: 0.9
adam_beta2: 0.95
adam_eps: 1.0e-08
name: adamW
torch_adam_is_fused: true
weight_decay: 0.01
zero_stage: 0
parallelism:
dp: 32 # 4 nodes
expert_parallel_size: 1
pp: 1
pp_engine: 1f1b
recompute_layer: false
tp: 1
tp_linear_async_communication: true
tp_mode: REDUCE_SCATTER
tp_recompute_allgather: true
profiler: null
tokenizer:
tokenizer_max_length: null
tokenizer_name_or_path: HuggingFaceTB/cosmo2-tokenizer
tokenizer_revision: null
tokens:
batch_accumulation_per_replica: 2
limit_test_batches: 0
limit_val_batches: 0
micro_batch_size: 8 # GBS = 8*2*32*sequence_length = 512*sequence_length = 1M tokens
sequence_length: 2048
train_steps: 600000
val_check_interval: -1