#!/usr/bin/env bash
# Launch training via torchrun with a chosen YAML config.
#
# Parallelism layout per GPU count (dp = data, pp = pipeline, tp = tensor):
#   --nproc_per_node=8: dp=2, pp=2, and tp=2
#   --nproc_per_node=4: dp=1, pp=2, and tp=2
#   --nproc_per_node=1: dp=1, pp=1, and tp=1
#
# NOTE(review): CUDA_DEVICE_MAX_CONNECTIONS=1 is presumably needed by the
# tensor-parallel communication overlap in run_train.py — confirm against
# the training framework's docs before removing.
set -euo pipefail

CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=4 run_train.py --config-file examples/config_tiny_llama.yaml

# Alternative configs — uncomment exactly one to use instead:
# CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=4 run_train.py --config-file examples/config_tiny_llama_cosmo2tokenizer.yaml
# CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=4 run_train.py --config-file examples/config_llama3_dummytokenizer.yaml
# CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=4 run_train.py --config-file examples/config_llama3.yaml