"...lm-evaluation-harness.git" did not exist on "cef6aa8d76e0ac91352ce88283db198ba45b3de7"
Commit d99506f3 authored by chenzk

v1.0.1

parent 61e92904
"""
This script use to log evaluation results to wandb.
python3 log_eval_results_to_wandb.py --eval-path /path/to/eval/results --wandb-project project_name --wandb-name run_name
The folder that contains the evaluation results should have the following structure:
- 5000:
results_x.json # where x is the ligheval's evaluation number
- 10000:
...
...
"""
import argparse
import json
import os
from pathlib import Path
import wandb
def run(current_path: Path):
def compute_avg_acc_of_a_benchmark(data, benchmark_prefix):
sum_acc, sum_acc_norm, sum_acc_stderr, sum_acc_norm_stderr, count = 0, 0, 0, 0, 0
for key, values in data.items():
if f"{benchmark_prefix}:" in key:
sum_acc += values["acc"]
sum_acc_norm += values["acc_norm"]
sum_acc_stderr += values["acc_stderr"]
sum_acc_norm_stderr += values["acc_norm_stderr"]
count += 1
average_acc = sum_acc / count if count else 0
return average_acc
def compute_avg_acc_of_all_tasks(data):
sum_acc, count = 0, 0
for _, values in data.items():
sum_acc += values["acc"]
count += 1
average_acc = sum_acc / count if count else 0
return average_acc
list_checkpoints = os.listdir(current_path)
sorted_list_checkpoints = sorted(list_checkpoints, key=int)
for item in sorted_list_checkpoints:
item_path = os.path.join(current_path, item)
if os.path.isdir(item_path):
json_files = [f for f in os.listdir(item_path) if f.endswith(".json")]
if len(json_files) == 1:
json_file_path = os.path.join(item_path, json_files[0])
with open(json_file_path, "r") as file:
eval_data = json.load(file)
iteration_step = eval_data["config_general"]["config"]["general"]["step"]
consumed_train_samples = eval_data["config_general"]["config"]["general"]["consumed_train_samples"]
logging_results = {}
for name, data in eval_data["results"].items():
logging_results[f"{name}_acc"] = data["acc"]
logging_results["mmlu:average_acc"] = compute_avg_acc_of_a_benchmark(eval_data["results"], "mmlu")
logging_results["arc:average_acc"] = compute_avg_acc_of_a_benchmark(eval_data["results"], "arc")
logging_results["all:average_acc"] = compute_avg_acc_of_all_tasks(eval_data["results"])
wandb.log(
{
**logging_results,
"iteration_step": iteration_step,
"consumed_train_samples": consumed_train_samples,
}
)
elif len(json_files) > 1:
print(f"More than one JSON file found in {item_path}. Skipping.")
else:
print(f"No JSON file found in {item_path}.")
print(f"Checkpoint {item} is done. /n")
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--eval-path", type=str, required=True, help="Path of the lighteval's evaluation results")
parser.add_argument(
"--wandb-project", type=str, help="Path of the lighteval's evaluation results", default="nanotron_evals"
)
parser.add_argument(
"--wandb-name",
type=str,
required=True,
help="Path of the lighteval's evaluation results",
default="sanity_evals",
)
return parser.parse_args()
if __name__ == "__main__":
args = get_args()
eval_path = args.eval_path
wandb_project = args.wandb_project
wandb_name = args.wandb_name
wandb.init(
project=wandb_project,
name=wandb_name,
config={"eval_path": eval_path},
)
run(eval_path)
# Pre-training
We use the [nanotron](https://github.com/huggingface/nanotron/) library to train the SmolLM and SmolLM2 base models.
The scripts for training SmolLM v1 can be found in the `smollm1` folder. SmolLM2 has a similar architecture and setup, but uses an improved data mixture that we curated and significantly longer training (11 trillion tokens for the 1.7B, 4 trillion for the 360M, and 2 trillion for the 135M). We will upload the SmolLM2 configs soon.
## Setup
Please refer to [nanotron](https://github.com/huggingface/nanotron/) for detailed instructions on setting up your training environment and launching jobs.
After setting up the environment and tokenizing the training datasets with [datatrove](https://github.com/huggingface/datatrove) (instructions available [here](https://github.com/huggingface/nanotron/blob/main/docs/nanoset.md#nanosets)), you can modify the configurations to match your number of nodes and local paths.
Below is an example of launching SmolLM1 135M training on a single node with 8 GPUs. Change the DP value to 8 in the config and adjust the batch size accordingly (see the snippet after the launch command), then run:
```bash
git clone https://github.com/huggingface/nanotron
cd nanotron
# follow installation
CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=8 run_train.py --config-file smollm1/config_smollm1_135M.yaml
```
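For reference, the provided 135M config assumes 32 data-parallel ranks (4 nodes of 8 GPUs), giving a global batch size of 8 (micro batch) * 2 (gradient accumulation) * 32 (DP) = 512 sequences of 2048 tokens, i.e. roughly 1M tokens per step. A minimal sketch of the single-node adjustment mentioned above that keeps the same global batch size (the accumulation value below is our assumption, not part of the released config; tune it to your hardware):

```yaml
parallelism:
  dp: 8                              # 1 node * 8 GPUs instead of 4 nodes * 8 GPUs
tokens:
  micro_batch_size: 8
  batch_accumulation_per_replica: 8  # 8 * 8 * 8 = 512 sequences per step, unchanged
```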
If you are working on a Slurm cluster, you can modify `launch.slurm` and launch the training with:
```bash
sbatch launch.slurm
```
> [!NOTE]
> Don't forget to create the logs directory before launching the job:
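> For example, assuming `launch.slurm` writes its output to a `logs/` folder at the repository root (adjust the path to match your script):
> ```bash
> mkdir -p logs
> ```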
# SmolLM1 135M trained on 600B tokens
checkpoints:
  checkpoint_interval: 2000
  checkpoints_path: checkpoints
  checkpoints_path_is_shared_file_system: false
  resume_checkpoint_path: null
  save_final_state: false
  save_initial_state: false
data_stages:
- data:
    dataset:
      dataset_folder: # paths to tokenized datasets
      - datasets/fineweb-edu-dedup
      - datasets/cosmopedia-v2
      - datasets/python-edu
      - datasets/open-web-math
      - datasets/stackoverflow
      dataset_weights:
      - 0.7
      - 0.15
      - 0.08
      - 0.06
      - 0.01
    num_loading_workers: 1
    seed: 42
  name: training stage
  start_training_step: 1
general:
  benchmark_csv_path: null
  consumed_train_samples: null
  ignore_sanity_checks: true
  project: smollm
  run: smollm-135M
  seed: 8
  step: null
logging:
  iteration_step_info_interval: 1
  log_level: info
  log_level_replica: info
model:
  ddp_bucket_cap_mb: 25
  dtype: bfloat16
  init_method:
    std: 0.0416 # 1/sqrt(hidden_size)
  make_vocab_size_divisible_by: 1
  model_config:
    bos_token_id: 0
    eos_token_id: 0
    hidden_act: silu
    hidden_size: 576
    initializer_range: 0.02
    intermediate_size: 1536
    is_llama_config: true
    max_position_embeddings: 2048
    num_attention_heads: 9
    num_hidden_layers: 30
    num_key_value_heads: 3
    pad_token_id: null
    pretraining_tp: 1
    rms_norm_eps: 1.0e-05
    rope_scaling: null
    rope_theta: 10000.0
    tie_word_embeddings: true
    use_cache: true
    vocab_size: 49152
optimizer:
  accumulate_grad_in_fp32: true
  clip_grad: 1.0
  learning_rate_scheduler:
    learning_rate: 0.003
    lr_decay_starting_step: 250000
    lr_decay_steps: 50000
    lr_decay_style: 1-sqrt
    lr_warmup_steps: 2500
    lr_warmup_style: linear
    min_decay_lr: 0
  optimizer_factory:
    adam_beta1: 0.9
    adam_beta2: 0.95
    adam_eps: 1.0e-08
    name: adamW
    torch_adam_is_fused: true
  weight_decay: 0.01
  zero_stage: 0
parallelism:
  dp: 32 # 4 nodes
  expert_parallel_size: 1
  pp: 1
  pp_engine: 1f1b
  recompute_layer: false
  tp: 1
  tp_linear_async_communication: true
  tp_mode: REDUCE_SCATTER
  tp_recompute_allgather: true
profiler: null
tokenizer:
  tokenizer_max_length: null
  tokenizer_name_or_path: HuggingFaceTB/cosmo2-tokenizer
  tokenizer_revision: null
tokens:
  batch_accumulation_per_replica: 2
  limit_test_batches: 0
  limit_val_batches: 0
  micro_batch_size: 8 # GBS = 8*2*32*sequence_length = 512*sequence_length = 1M tokens
  sequence_length: 2048
  train_steps: 600000
  val_check_interval: -1
# SmolLM1 135M trained on 600B tokens
checkpoints:
  checkpoint_interval: 2000
  checkpoints_path: checkpoints
  checkpoints_path_is_shared_file_system: false
  resume_checkpoint_path: null
  save_final_state: false
  save_initial_state: false
data_stages:
- data:
    dataset:
      dataset_folder: # paths to tokenized datasets
      - datasets/fineweb-edu-dedup-ds
      - datasets/fineweb-edu-dedup-ds
      - datasets/fineweb-edu-dedup-ds
      - datasets/fineweb-edu-dedup-ds
      - datasets/fineweb-edu-dedup-ds
      dataset_weights:
      - 0.7
      - 0.15
      - 0.08
      - 0.06
      - 0.01
    num_loading_workers: 1
    seed: 42
  name: training stage
  start_training_step: 1
general:
  benchmark_csv_path: null
  consumed_train_samples: null
  ignore_sanity_checks: true
  project: smollm
  run: smollm-135M
  seed: 8
  step: null
logging:
  iteration_step_info_interval: 1
  log_level: info
  log_level_replica: info
model:
  ddp_bucket_cap_mb: 25
  dtype: bfloat16
  init_method:
    std: 0.0416 # 1/sqrt(hidden_size)
  make_vocab_size_divisible_by: 1
  model_config:
    bos_token_id: 0
    eos_token_id: 0
    hidden_act: silu
    hidden_size: 576
    initializer_range: 0.02
    intermediate_size: 1536
    is_llama_config: true
    max_position_embeddings: 2048
    num_attention_heads: 9
    num_hidden_layers: 30
    num_key_value_heads: 3
    pad_token_id: null
    pretraining_tp: 1
    rms_norm_eps: 1.0e-05
    rope_scaling: null
    rope_theta: 10000.0
    tie_word_embeddings: true
    use_cache: true
    vocab_size: 49152
optimizer:
  accumulate_grad_in_fp32: true
  clip_grad: 1.0
  learning_rate_scheduler:
    learning_rate: 0.003
    lr_decay_starting_step: 250000
    lr_decay_steps: 50000
    lr_decay_style: 1-sqrt
    lr_warmup_steps: 2500
    lr_warmup_style: linear
    min_decay_lr: 0
  optimizer_factory:
    adam_beta1: 0.9
    adam_beta2: 0.95
    adam_eps: 1.0e-08
    name: adamW
    torch_adam_is_fused: true
  weight_decay: 0.01
  zero_stage: 0
parallelism:
  dp: 1
  expert_parallel_size: 1
  pp: 1
  pp_engine: 1f1b
  recompute_layer: false
  tp: 1
  tp_linear_async_communication: true
  tp_mode: REDUCE_SCATTER
  tp_recompute_allgather: true
profiler: null
tokenizer:
  tokenizer_max_length: null
  tokenizer_name_or_path: HuggingFaceTB/cosmo2-tokenizer
  tokenizer_revision: null
tokens:
  batch_accumulation_per_replica: 2
  limit_test_batches: 0
  limit_val_batches: 0
  micro_batch_size: 8 # GBS = 8*2*1*sequence_length = 16*sequence_length = ~33K tokens
  sequence_length: 2048
  train_steps: 2000
  val_check_interval: -1
# SmolLM1 135M trained on 600B tokens
checkpoints:
  checkpoint_interval: 2000
  checkpoints_path: checkpoints
  checkpoints_path_is_shared_file_system: false
  resume_checkpoint_path: null
  save_final_state: false
  save_initial_state: false
data_stages:
- data:
    dataset:
      dataset_overwrite_cache: false
      dataset_processing_num_proc_per_process: 1
      hf_dataset_config_name: null
      hf_dataset_or_datasets: datasets/fineweb-edu-dedup # HF dataset name or local path to a raw (untokenized) dataset
      hf_dataset_splits: train
      text_column_name: text
    num_loading_workers: 1
    seed: 42
  name: training stage
  start_training_step: 1
general:
  benchmark_csv_path: null
  consumed_train_samples: null
  ignore_sanity_checks: true
  project: smollm
  run: smollm-135M
  seed: 8
  step: null
logging:
  iteration_step_info_interval: 1
  log_level: info
  log_level_replica: info
model:
  ddp_bucket_cap_mb: 25
  dtype: bfloat16
  init_method:
    std: 0.0416 # 1/sqrt(hidden_size)
  make_vocab_size_divisible_by: 1
  model_config:
    bos_token_id: 0
    eos_token_id: 0
    hidden_act: silu
    hidden_size: 576
    initializer_range: 0.02
    intermediate_size: 1536
    is_llama_config: true
    max_position_embeddings: 2048
    num_attention_heads: 9
    num_hidden_layers: 30
    num_key_value_heads: 3
    pad_token_id: null
    pretraining_tp: 1
    rms_norm_eps: 1.0e-05
    rope_scaling: null
    rope_theta: 10000.0
    tie_word_embeddings: true
    use_cache: true
    vocab_size: 49152
optimizer:
  accumulate_grad_in_fp32: true
  clip_grad: 1.0
  learning_rate_scheduler:
    learning_rate: 0.003
    lr_decay_starting_step: 250000
    lr_decay_steps: 50000
    lr_decay_style: 1-sqrt
    lr_warmup_steps: 2500
    lr_warmup_style: linear
    min_decay_lr: 0
  optimizer_factory:
    adam_beta1: 0.9
    adam_beta2: 0.95
    adam_eps: 1.0e-08
    name: adamW
    torch_adam_is_fused: true
  weight_decay: 0.01
  zero_stage: 0
parallelism:
  dp: 1
  expert_parallel_size: 1
  pp: 1
  pp_engine: 1f1b
  recompute_layer: false
  tp: 1
  tp_linear_async_communication: true
  tp_mode: REDUCE_SCATTER
  tp_recompute_allgather: true
profiler: null
tokenizer:
  tokenizer_max_length: null
  tokenizer_name_or_path: HuggingFaceTB/cosmo2-tokenizer
  tokenizer_revision: null
tokens:
  batch_accumulation_per_replica: 2
  limit_test_batches: 0
  limit_val_batches: 0
  micro_batch_size: 8 # GBS = 8*2*1*sequence_length = 16*sequence_length = ~33K tokens
  sequence_length: 2048
  train_steps: 2000
  val_check_interval: -1
# SmolLM1 1.7B trained on 1T tokens
checkpoints:
  checkpoint_interval: 2000
  checkpoints_path: checkpoints
  checkpoints_path_is_shared_file_system: false
  resume_checkpoint_path: null
  save_final_state: false
  save_initial_state: false
data_stages:
- data:
    dataset:
      dataset_folder: # paths to tokenized datasets
      - datasets/fineweb-edu-dedup
      - datasets/cosmopedia-v2
      - datasets/open-web-math
      - datasets/starcoderdata-python
      - datasets/stackoverflow
      dataset_weights:
      - 0.7
      - 0.15
      - 0.06
      - 0.08
      - 0.01
    num_loading_workers: 1
    seed: 42
  name: training stage
  start_training_step: 1
- data:
    dataset: # we change data mixture to use python-edu
      dataset_folder:
      - datasets/fineweb-edu-dedup
      - datasets/cosmopedia-v2
      - datasets/open-web-math
      - datasets/python-edu
      - datasets/stackoverflow
      - datasets/deepmind_mathematics
      dataset_weights:
      - 0.7
      - 0.15
      - 0.055
      - 0.08
      - 0.01
      - 0.005
    num_loading_workers: 1
    seed: 42
  name: training stage 2
  start_training_step: 300000
general:
  benchmark_csv_path: null
  consumed_train_samples: null
  ignore_sanity_checks: true
  project: smollm
  run: smollm-1700M
  seed: 8
  step: null
logging:
  iteration_step_info_interval: 1
  log_level: info
  log_level_replica: info
model:
  ddp_bucket_cap_mb: 25
  dtype: bfloat16
  init_method:
    std: 0.022097086912079608
  make_vocab_size_divisible_by: 1
  model_config:
    bos_token_id: 0
    eos_token_id: 0
    hidden_act: silu
    hidden_size: 2048
    initializer_range: 0.02
    intermediate_size: 8192
    is_llama_config: true
    max_position_embeddings: 2048
    num_attention_heads: 32
    num_hidden_layers: 24
    num_key_value_heads: 32
    pad_token_id: null
    pretraining_tp: 1
    rms_norm_eps: 1.0e-05
    rope_scaling: null
    rope_theta: 10000.0
    tie_word_embeddings: true
    use_cache: true
    vocab_size: 49152
optimizer:
  accumulate_grad_in_fp32: true
  clip_grad: 1.0
  learning_rate_scheduler:
    learning_rate: 0.0005
    lr_decay_starting_step: 400000
    lr_decay_steps: 100000
    lr_decay_style: 1-sqrt
    lr_warmup_steps: 2000
    lr_warmup_style: linear
    min_decay_lr: 0
  optimizer_factory:
    adam_beta1: 0.9
    adam_beta2: 0.95
    adam_eps: 1.0e-08
    name: adamW
    torch_adam_is_fused: true
  weight_decay: 0.01
  zero_stage: 0
parallelism:
  dp: 64 # 8 nodes
  expert_parallel_size: 1
  pp: 1
  pp_engine: 1f1b
  recompute_layer: false
  tp: 1
  tp_linear_async_communication: true
  tp_mode: REDUCE_SCATTER
  tp_recompute_allgather: true
profiler: null
tokenizer:
  tokenizer_max_length: null
  tokenizer_name_or_path: HuggingFaceTB/cosmo2-tokenizer
  tokenizer_revision: null
tokens:
  batch_accumulation_per_replica: 4
  limit_test_batches: 0
  limit_val_batches: 0
  micro_batch_size: 4 # GBS = 4*4*64*sequence_length = 1024*sequence_length = 2.1M tokens
  sequence_length: 2048
  train_steps: 500000
  val_check_interval: -1
# SmolLM1 360M trained on 600B tokens
checkpoints:
  checkpoint_interval: 2000
  checkpoints_path: checkpoints
  checkpoints_path_is_shared_file_system: false
  resume_checkpoint_path: null
  save_final_state: false
  save_initial_state: false
data_stages:
- data:
    dataset:
      dataset_folder: # paths to tokenized datasets
      - datasets/fineweb-edu-dedup
      - datasets/cosmopedia-v2
      - datasets/python-edu
      - datasets/open-web-math
      - datasets/stackoverflow
      dataset_weights:
      - 0.7
      - 0.15
      - 0.08
      - 0.06
      - 0.01
    num_loading_workers: 1
    seed: 42
  name: training stage
  start_training_step: 1
general:
  benchmark_csv_path: null
  consumed_train_samples: null
  ignore_sanity_checks: true
  project: smollm
  run: smollm-360M
  seed: 8
  step: null
logging:
  iteration_step_info_interval: 1
  log_level: info
  log_level_replica: info
model:
  ddp_bucket_cap_mb: 25
  dtype: bfloat16
  init_method:
    std: 0.03227486121839514
  make_vocab_size_divisible_by: 1
  model_config:
    bos_token_id: 0
    eos_token_id: 0
    hidden_act: silu
    hidden_size: 960
    initializer_range: 0.02
    intermediate_size: 2560
    is_llama_config: true
    max_position_embeddings: 2048
    num_attention_heads: 15
    num_hidden_layers: 32
    num_key_value_heads: 5
    pad_token_id: null
    pretraining_tp: 1
    rms_norm_eps: 1.0e-05
    rope_scaling: null
    rope_theta: 10000.0
    tie_word_embeddings: true
    use_cache: true
    vocab_size: 49152
optimizer:
  accumulate_grad_in_fp32: true
  clip_grad: 1.0
  learning_rate_scheduler:
    learning_rate: 0.003
    lr_decay_starting_step: 500000
    lr_decay_steps: 100000
    lr_decay_style: 1-sqrt
    lr_warmup_steps: 5000
    lr_warmup_style: linear
    min_decay_lr: 0
  optimizer_factory:
    adam_beta1: 0.9
    adam_beta2: 0.95
    adam_eps: 1.0e-08
    name: adamW
    torch_adam_is_fused: true
  weight_decay: 0.01
  zero_stage: 0
parallelism:
  dp: 32
  expert_parallel_size: 1
  pp: 1
  pp_engine: 1f1b
  recompute_layer: false
  tp: 1
  tp_linear_async_communication: true
  tp_mode: REDUCE_SCATTER
  tp_recompute_allgather: true
profiler: null
tokenizer:
  tokenizer_max_length: null
  tokenizer_name_or_path: HuggingFaceTB/cosmo2-tokenizer
  tokenizer_revision: null
tokens:
  batch_accumulation_per_replica: 2
  limit_test_batches: 0
  limit_val_batches: 0
  micro_batch_size: 8
  sequence_length: 2048
  train_steps: 600000
  val_check_interval: -1