# CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=1 run_train.py --config-file smollm1/config_smollm1_135M_demo1.yaml
# CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=1 run_train.py --config-file smollm1/config_smollm1_135M_demo2.yaml