Commit d99506f3 authored by chenzk's avatar chenzk
Browse files

v1.0.1

parent 61e92904
Pipeline #2033 canceled with stages
[project]
name = "nanotron"
version = "0.4"
description = "Minimalistic Large Language Model Training and Finetuning"
authors = [
{name = "Nouamane Tazi", email="nouamane@huggingface.co"},
{name = "Thomas Wang", email="thomas.wang@huggingface.co"},
{name = "Kunhao Zheng", email="kunhao@huggingface.co"},
{name = "Thomas Wolf", email="thomas@huggingface.co"},
]
readme = "README.md"
requires-python = "~=3.10"
classifiers = [
"Topic :: Software Development"
]
dependencies = [
"torch>=1.13.1",
"pyyaml",
"numpy",
"packaging",
"safetensors",
"dacite",
"tqdm",
"datasets",
]
[tool.setuptools.packages.find]
where = ["src"] # list of folders that contain the packages (["."] by default)
[tool.ruff]
line-length = 119
ignore = ["C901","E501"] # E501 ignores length violation handled by black
select = ["C","E","F","I","W"]
ignore-init-module-imports = true
[project.optional-dependencies]
dev = [
"pre-commit",
"pylint"
]
test = [
"pytest",
"pytest-xdist"
]
fast-modeling = [
"flash-attn>=2.5.0",
]
nanosets = [
"transformers",
"datatrove[io,processing]@git+https://github.com/huggingface/datatrove",
"numba",
]
s3 = [
"boto3",
"s3fs",
"s5cmd",
]
[build-system]
requires = [
"setuptools",
]
[pytest]
norecursedirs="tests/helpers"
dummy-tokenizer-wordlevel @ ae57c419
Subproject commit ae57c419a98ae4ddf991c6a1af4a8ce94745f45c
"""
Nanotron Inference Script
Usage:
```
export CUDA_DEVICE_MAX_CONNECTIONS=1 # important for some distributed operations
torchrun --nproc_per_node=4 run_generate.py ---ckpt-path checkpoints/test/4
```
"""
import argparse
import os
from pathlib import Path
import torch
from nanotron import distributed as dist
from nanotron import logging
from nanotron.config import (
GenerationArgs,
LoggingArgs,
ParallelismArgs,
get_config_from_file,
)
from nanotron.generation.decode import (
GenerationInput,
TokenizerConfig,
decode_text,
decode_tokenized,
)
from nanotron.logging import log_rank, set_ranks_logging_level
from nanotron.models import build_model
from nanotron.parallel import ParallelContext
from nanotron.parallel.parameters import sanity_check
from nanotron.parallel.pipeline_parallel.engine import (
OneForwardOneBackwardPipelineEngine,
)
from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer
from nanotron.parallel.tensor_parallel.enum import TensorParallelLinearMode
from nanotron.random import (
RandomStates,
get_current_random_state,
get_synced_random_state,
set_random_seed,
)
from nanotron.serialize import load_weights
from nanotron.trainer import CONFIG_TO_MODEL_CLASS, mark_tied_parameters
try:
from transformers import AutoTokenizer
except ImportError:
AutoTokenizer = None
logger = logging.get_logger(__name__)
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--ckpt-path", type=Path, required=True, help="Checkpoint path")
parser.add_argument("--dp", type=int, default=1)
parser.add_argument("--pp", type=int, default=0)
parser.add_argument("--tp", type=int, default=0)
parser.add_argument("--max-new-tokens", type=int, default=128, help="Maximum number of new tokens to generate")
return parser.parse_args()
def main():
args = get_args()
assert args.ckpt_path.exists(), f"Checkpoint path {args.ckpt_path} does not exist"
config = get_config_from_file((args.ckpt_path / "config.yaml").as_posix())
model_config = config.model.model_config
tokenizer_path = config.tokenizer.tokenizer_name_or_path
parallel_config = ParallelismArgs(
dp=args.dp or config.parallelism.dp,
pp=args.pp or config.parallelism.pp,
tp=args.tp or config.parallelism.tp,
pp_engine=OneForwardOneBackwardPipelineEngine(),
tp_mode=TensorParallelLinearMode.ALL_REDUCE,
tp_linear_async_communication=False,
)
# Initialise all process groups
parallel_context = ParallelContext(
data_parallel_size=parallel_config.dp,
pipeline_parallel_size=parallel_config.pp,
tensor_parallel_size=parallel_config.tp,
)
# Set log levels
logging_config = LoggingArgs(
log_level="info",
log_level_replica="info",
)
# Set log levels
set_ranks_logging_level(parallel_context=parallel_context, logging_config=logging_config)
log_rank(f"model_config: {model_config}", logger=logger, level=logging.INFO, rank=0)
log_rank(f"tokenizer_path: {tokenizer_path}", logger=logger, level=logging.INFO, rank=0)
dtype = torch.bfloat16
# Set random states
set_random_seed(42)
model_config_cls = model_config.__class__.__name__
if model_config_cls not in CONFIG_TO_MODEL_CLASS:
raise ValueError(
f"Unsupported model config {model_config_cls}. Only {CONFIG_TO_MODEL_CLASS.keys()} are supported"
)
# Get synchronized random states
if parallel_config.tp_mode is TensorParallelLinearMode.ALL_REDUCE:
random_states = RandomStates(
{"tp_synced": get_synced_random_state(random_state=get_current_random_state(), pg=parallel_context.tp_pg)}
)
else:
# We don't need to sync across TP when using sequence parallel (REDUCE_SCATTER)
random_states = RandomStates({})
model = build_model(
model_builder=lambda: CONFIG_TO_MODEL_CLASS[model_config_cls](
config=model_config,
parallel_context=parallel_context,
parallel_config=parallel_config,
random_states=random_states,
),
dtype=dtype,
parallel_context=parallel_context,
)
# Mark some parameters as tied
# TODO @nouamane: this is only needed for training, can we just mark params as NanotronParameter instead?
mark_tied_parameters(model=model, parallel_context=parallel_context, parallel_config=parallel_config)
# Sanity check model
sanity_check(root_module=model)
# Load checkpoint
checkpoint_path = args.ckpt_path
log_rank(
f"Loading checkpoint from {checkpoint_path}:",
logger=logger,
level=logging.INFO,
rank=0,
)
load_weights(model=model, parallel_context=parallel_context, root_folder=checkpoint_path)
model.eval()
if AutoTokenizer is not None:
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
# tokenizer.pad_token_id = tokenizer.eos_token_id
if tokenizer.pad_token_id is None:
if tokenizer.eos_token_id is not None:
tokenizer.pad_token_id = tokenizer.eos_token_id
elif getattr(model.config, "pad_token_id", None) is not None:
tokenizer.pad_token_id = int(model.config.pad_token_id)
elif getattr(model.config, "eos_token_id", None) is not None:
tokenizer.pad_token_id = int(model.config.eos_token_id)
else:
tokenizer.add_special_tokens({"pad_token": "[PAD]"})
tokenizer.padding_side = "left"
tokenizer.truncation_side = "left" # TODO @nouamane: do we want this?
dummy_inputs = [
"The future of AI is",
# "Passage: Daniel went back to the garden. Mary travelled to the kitchen. Sandra journeyed to the kitchen. Sandra went to the hallway. John went to the bedroom. Mary went back to the garden. Where is Mary?\nAnswer:",
"def fib(n)",
# 'Here is an extract from a webpage: "Have you ever experienced heel pain after a heavy physical activity, or even right after a long period of standing? If you regard this as something usual and normal, then think again. Miscalled as heel pain, plantar fasciitis causes these frequent mild pains experienced in the soles of the feet. It is the inflammation and enlargement the plantar fascia tissue that is located in the heels of the feet, stretching to the base of the toes. This tissue is responsible for absorbing shock in the feet and for supporting the arches. It also plays a vital role in foot movements during walking and standing. Many factors such as excessive walking, standing, and running trigger heel pain and plantar fasciitis. A sudden increase in intensity of activities, increase in weight, and abrupt change of footwear also cause the swelling of the ligament. Non-supportive footwear lacking arch cushions and improper and worn out running or training can also lead to the problem. It is also most evident among those". Write an extensive and detailed course unit suitable for a textbook targeted at college students, related to the given extract, within the context of "Medicine". Do not just list concepts, but develop each one in detail before moving to the next, as we prioritize depth of understanding and comprehensive exploration of the subject matter over breadth. Focus on: - Rigor: Ensure in-depth coverage of the concepts/sections. - Engagement: Write with an academic, professional and engaging tone that captivates interest. - Application: Incorporate specific, practical examples, such as proofs in calculus or critical dates and figures in history. Do not include a title or an introduction, simply write the content without headlines and introductory phrases. Do not use images.',
# "Advancements in technology will lead to",
# "Tomorrow's world is shaped by",
]
outputs = decode_text(
input_iter=(GenerationInput(text=text) for text in dummy_inputs),
tokenizer=tokenizer,
# TODO @thomasw21: From ModelWithLoss extract the model.
model=model.model,
parallel_context=parallel_context,
max_new_tokens=args.max_new_tokens,
max_micro_batch_size=2,
generation_config=GenerationArgs(sampler="greedy", use_cache=True),
tokenizer_config=TokenizerConfig(max_input_length=None),
is_bench=os.environ.get("USE_BENCH", "0") == "1",
)
for output in outputs:
input_ids = output.input_ids
generated_ids = output.generation_ids
if isinstance(input_ids, TensorPointer):
assert isinstance(generated_ids, TensorPointer)
continue
assert isinstance(generated_ids, torch.Tensor)
log_rank(
f"input: {tokenizer.decode(input_ids, clean_up_tokenization_spaces=False)[:1000]}",
logger=logger,
level=logging.INFO,
rank=0,
)
log_rank(
f"generation: {tokenizer.decode(generated_ids[len(input_ids) :], clean_up_tokenization_spaces=False)}",
logger=logger,
level=logging.INFO,
rank=0,
)
log_rank(
"--------------------------------------------------",
logger=logger,
level=logging.INFO,
rank=0,
)
else:
outputs = decode_tokenized(
input_ids=torch.zeros(1, 1).to(dtype=torch.int64, device="cuda"),
input_mask=torch.ones(1, 1).to(dtype=torch.bool, device="cuda"),
model=model.model,
parallel_context=parallel_context,
generation_config=GenerationArgs(sampler="greedy", use_cache=True),
max_micro_batch_size=1,
max_new_tokens=12,
returns_logits=False,
)
for output in outputs:
input_ids = output.input_ids
generated_ids = output.generation_ids
if isinstance(input_ids, TensorPointer):
assert isinstance(generated_ids, TensorPointer)
continue
assert isinstance(generated_ids, torch.Tensor)
log_rank(
f"generation: {generated_ids[len(input_ids) :]}",
logger=logger,
level=logging.INFO,
rank=0,
)
log_rank(
"--------------------------------------------------",
logger=logger,
level=logging.INFO,
rank=0,
)
dist.barrier()
if __name__ == "__main__":
main()
"""
Nanotron training script.
Usage:
```
export CUDA_DEVICE_MAX_CONNECTIONS=1 # important for some distributed operations
torchrun --nproc_per_node=8 run_train.py --config-file examples/config_tiny_llama.yaml
```
"""
import argparse
from typing import Dict, cast
import numpy as np
from nanotron import logging
from nanotron.config import DataArgs, DatasetStageArgs, NanosetDatasetsArgs, PretrainDatasetsArgs
from nanotron.data.dataloader_builder import build_nanoset_dataloader
from nanotron.dataloader import (
clm_process,
dummy_infinite_data_generator,
get_datasets,
get_train_dataloader,
)
from nanotron.helpers import (
compute_remain_train_steps_of_a_data_stage_from_ckp,
get_consumed_train_samples_of_a_data_stage_from_ckp,
)
from nanotron.logging import log_rank
from nanotron.parallel.pipeline_parallel.utils import get_input_output_pp_ranks
from nanotron.trainer import DistributedTrainer
from nanotron.utils import main_rank_first
from torch.utils.data import DataLoader
try:
from huggingface_hub import __version__ as hf_hub_version
from transformers import AutoTokenizer
from transformers import __version__ as tf_version
except ImportError:
hf_hub_version = None
tf_version = None
logger = logging.get_logger(__name__)
def get_dataloader_from_data_stage(
trainer: DistributedTrainer,
data: DataArgs,
consumed_train_samples: int,
num_remaining_train_steps: int,
):
"""
Returns a dataloader for a given data stage.
data: The data configuration for the current stage.
consumed_train_samples: The number of samples consumed by the model in the this stage (each stage starts from zero).
num_remaining_train_steps: The number of remaining training steps for this stage.
"""
assert consumed_train_samples >= 0, "consumed_train_samples should be greater than 0"
assert num_remaining_train_steps >= 0, "num_remaining_train_steps should be greater than 0"
# First, we need to know which ranks to feed the dataloader to
input_pp_rank, output_pp_rank = get_input_output_pp_ranks(model=trainer.model)
# Case 1: Dummy data generator
if data.dataset is None:
log_rank("Using dummy data generator", logger=logger, level=logging.INFO, rank=0)
dataloader = dummy_infinite_data_generator(
micro_batch_size=trainer.micro_batch_size,
sequence_length=trainer.sequence_length,
input_pp_rank=input_pp_rank,
output_pp_rank=output_pp_rank,
vocab_size=trainer.model_config.vocab_size,
seed=data.seed,
parallel_context=trainer.parallel_context,
)()
# Case 2: HuggingFace datasets
elif isinstance(data.dataset, PretrainDatasetsArgs):
log_rank("Using `datasets` library", logger=logger, level=logging.INFO, rank=0)
tokenizer_path = trainer.config.tokenizer.tokenizer_name_or_path
log_rank(
f"Loading tokenizer from {tokenizer_path} and transformers/hf_hub versions {tf_version, hf_hub_version}",
logger=logger,
level=logging.INFO,
rank=0,
)
# We need to the 1st device to process dataset and cache it, then other devices load from cache
with main_rank_first(trainer.parallel_context.world_pg):
# TODO @nouamanetazi: this may timeout before 1st device finishes processing dataset. Can we have a ctxmanager to modify timeout?
# TODO: generalise to include for validation/test splits
# We load the raw dataset
raw_dataset = get_datasets(
hf_dataset_or_datasets=data.dataset.hf_dataset_or_datasets,
hf_dataset_config_name=data.dataset.hf_dataset_config_name,
splits=data.dataset.hf_dataset_splits,
)["train"]
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
# Check that tokenizer's vocab size is smaller than the model's vocab size
assert (
tokenizer.vocab_size <= trainer.model_config.vocab_size
), f"Tokenizer's vocab size ({tokenizer.vocab_size}) is larger than the model's vocab size ({trainer.model_config.vocab_size})"
# We apply the Causal Language Modeling preprocessing
train_dataset = clm_process(
raw_dataset=raw_dataset,
tokenizer=tokenizer,
text_column_name=data.dataset.text_column_name,
dataset_processing_num_proc_per_process=data.dataset.dataset_processing_num_proc_per_process,
dataset_overwrite_cache=data.dataset.dataset_overwrite_cache,
sequence_length=trainer.sequence_length,
)
# We load the processed dataset on the ranks requiring it
dataloader = get_train_dataloader(
train_dataset=train_dataset,
sequence_length=trainer.sequence_length,
parallel_context=trainer.parallel_context,
input_pp_rank=input_pp_rank,
output_pp_rank=output_pp_rank,
micro_batch_size=trainer.micro_batch_size,
consumed_train_samples=consumed_train_samples,
dataloader_num_workers=data.num_loading_workers,
seed_worker=data.seed,
dataloader_drop_last=True,
)
# Check if we have enough samples for train_steps
total_tokens_dataset = len(dataloader.dataset) * trainer.sequence_length
num_tokens_needed_for_training = (
num_remaining_train_steps * trainer.global_batch_size * trainer.sequence_length
)
assert num_tokens_needed_for_training <= total_tokens_dataset, (
f"Dataset is too small for steps ({total_tokens_dataset} < {num_tokens_needed_for_training}), "
f"Try train_steps<={len(dataloader.dataset) // trainer.global_batch_size + trainer.iteration_step}"
)
# Case 3: Nanosets
elif isinstance(data.dataset, NanosetDatasetsArgs):
# Get tokenizer cardinality
tokenizer = AutoTokenizer.from_pretrained(trainer.config.tokenizer.tokenizer_name_or_path)
token_size = 4 if len(tokenizer) > np.iinfo(np.uint16).max + 1 else 2
del tokenizer
# Create Nanoset
from nanotron.data.nanoset import Nanoset
with main_rank_first(trainer.parallel_context.world_pg):
train_dataset = Nanoset(
dataset_folders=data.dataset.dataset_folder,
dataset_weights=data.dataset.dataset_weights,
sequence_length=trainer.sequence_length,
token_size=token_size,
train_split_num_samples=trainer.config.tokens.train_steps * trainer.global_batch_size,
random_seed=data.seed,
)
# Prepare dataloader
train_dataloader = build_nanoset_dataloader(
train_dataset,
trainer.sequence_length,
parallel_context=trainer.parallel_context,
input_pp_rank=input_pp_rank,
output_pp_rank=output_pp_rank,
micro_batch_size=trainer.micro_batch_size,
consumed_train_samples=consumed_train_samples,
dataloader_num_workers=data.num_loading_workers,
dataloader_drop_last=True,
)
return train_dataloader
else:
raise ValueError(f"Unhandled case of `self.config.data.dataset`. Got: {data.dataset}")
return dataloader
def get_dataloader(trainer: DistributedTrainer) -> Dict[str, DataLoader]:
dataloaders = {}
for stage_idx, stage in enumerate(trainer.config.data_stages):
# NOTE: we only create the dataloader for the first stage,
# then we lazy initialize the dataloader for the other stages
stage = cast(DatasetStageArgs, stage)
consumed_train_samples = get_consumed_train_samples_of_a_data_stage_from_ckp(stage, trainer.metadata)
assert (
consumed_train_samples is not None
), f"Cannot find consumed_train_samples for stage {stage.start_training_step} in the checkpoint"
num_remaining_train_steps = compute_remain_train_steps_of_a_data_stage_from_ckp(
stage, trainer.config, trainer.metadata
)
log_rank(
f"[Training Plan] Stage {stage.name} has {num_remaining_train_steps} remaining training steps and has consumed {consumed_train_samples} samples",
logger=logger,
level=logging.INFO,
rank=0,
)
dataloader = (
get_dataloader_from_data_stage(
trainer,
stage.data,
consumed_train_samples=consumed_train_samples,
num_remaining_train_steps=num_remaining_train_steps,
)
if stage_idx == 0
else lambda stage=stage: get_dataloader_from_data_stage(
trainer,
stage.data,
consumed_train_samples=consumed_train_samples,
num_remaining_train_steps=num_remaining_train_steps,
)
)
dataloaders[stage.name] = dataloader
return dataloaders
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--config-file", type=str, required=True, help="Path to the YAML or python config file")
return parser.parse_args()
if __name__ == "__main__":
args = get_args()
config_file = args.config_file
# Load trainer and data
trainer = DistributedTrainer(config_file)
dataloader = get_dataloader(trainer)
# Train
trainer.train(dataloader)
"""Fixes the problem where '{type.value}_{suffix_name}.safetensors' was duplicated in checkpoint files
For example this script will change the following:
```
checkpoints/10/model/model/decoder/0/pp_block/attn/o_proj/model_model_weight.safetensors_pp-rank-0-of-1_tp-rank-0-of-2.safetensors
to
checkpoints/10/model/model/decoder/0/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-2.safetensors
```
Example Usage:
python scripts/fix_checkpoint_bad_naming.py /fsx/nouamane/projects/nanotron/checkpoints/10
"""
import argparse
import os
import re
from pathlib import Path
def update_checkpoint(checkpoint_dir: str):
print(f"Updating checkpoint in {checkpoint_dir}")
for root, _, files in os.walk(checkpoint_dir):
for file in files:
if file.endswith(".safetensors"):
# r'(?<=model)_(model)' means match the string '_model' that is preceded by 'model'
if len(re.findall(r"(?<=model)_(model)", file)) == 0:
continue
# we remove second _model
new_file = re.sub(r"(?<=model)_(model)", "", file)
# we would have "model_weight.safetensors_pp-rank-0-of-1_tp-rank-0-of-2.safetensors"
# let's assert we have two matches of ".safetensors"
assert len(re.findall(r".safetensors", new_file)) == 2
# then we remove first match
new_file = re.sub(r".safetensors", "", new_file, count=1)
# so that we get "model_weight_pp-rank-0-of-1_tp-rank-0-of-2.safetensors"
print(f"Renaming {file} to {new_file}")
os.rename(os.path.join(root, file), os.path.join(root, new_file))
def main():
parser = argparse.ArgumentParser(description="Update checkpoint from 1.3 to 1.4")
parser.add_argument("checkpoint_dir", type=Path, help="Path to the checkpoint directory")
args = parser.parse_args()
update_checkpoint(args.checkpoint_dir)
if __name__ == "__main__":
main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment