START TIME: Fri Mar 15 10:33:15 CST 2024 [2024-03-15 10:33:20,407] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) [2024-03-15 10:33:25,655] [INFO] [runner.py:463:main] Using IP address of 10.3.6.78 for node c06r4n17 [2024-03-15 10:33:25,657] [INFO] [multinode_runner.py:72:get_cmd] Running on the following workers: c06r4n17,c06r4n18 [2024-03-15 10:33:25,657] [INFO] [runner.py:570:main] cmd = pdsh -S -f 1024 -w c06r4n17,c06r4n18 export UCX_MAX_EAGER_LANES=4; export UCX_MAX_RNDV_LANES=4; export UCX_ZCOPY_THRESH=auto; export UCX_WARN_UNUSED_ENV_VARS=n; export UCX_RNDV_THRESH=auto; export NCCL_IB_TIMEOUT=22; export UCX_IB_PCI_BW=mlx5_0:50Gbs,mlx5_1:50Gbs,mlx5_2:50Gbs,mlx5_3:50Gbs; export UCX_NET_DEVICES=mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1; export PYTHONPATH=/work/home/liangjing/LLM/LLaMA-Factory-main; cd /work/home/liangjing/LLM/LLaMA-Factory-main; /work/home/liangjing/anaconda3/envs/torch2.1/bin/python -u -m deepspeed.launcher.launch --world_info=eyJjMDZyNG4xNyI6IFswLCAxLCAyLCAzXSwgImMwNnI0bjE4IjogWzAsIDEsIDIsIDNdfQ== --node_rank=%n --master_addr=10.3.6.78 --master_port=29500 src/train_bash.py --stage 'sft' --do_train --template 'llama2' --dataset 'alpaca_gpt4_en,alpaca_gpt4_zh' --finetuning_type 'full' --model_name_or_path '/work/home/liangjing/.cache/modelscope/hub/skyline2006/llama-7b' --output_dir '/work/share/huchen1/liangjj/llama_factory' --per_device_train_batch_size '1' --per_device_eval_batch_size '1' --gradient_accumulation_steps '1' --preprocessing_num_workers '2' --lr_scheduler_type 'cosine' --logging_steps '10' --save_steps '100' --eval_steps '100' --learning_rate '5e-5' --max_grad_norm '0.5' --num_train_epochs '4.0' --val_size '0.01' --evaluation_strategy 'steps' --load_best_model_at_end --weight_decay '0.' --warmup_ratio '0.03' --plot_loss --fp16 --save_on_each_node --deepspeed 'deepspeed.json' c06r4n17: [2024-03-15 10:33:31,800] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) c06r4n17: [2024-03-15 10:33:33,251] [INFO] [launch.py:138:main] 0 NCCL_IB_TIMEOUT=22 c06r4n17: [2024-03-15 10:33:33,251] [INFO] [launch.py:145:main] WORLD INFO DICT: {'c06r4n17': [0, 1, 2, 3], 'c06r4n18': [0, 1, 2, 3]} c06r4n17: [2024-03-15 10:33:33,251] [INFO] [launch.py:151:main] nnodes=2, num_local_procs=4, node_rank=0 c06r4n17: [2024-03-15 10:33:33,252] [INFO] [launch.py:162:main] global_rank_mapping=defaultdict(, {'c06r4n17': [0, 1, 2, 3], 'c06r4n18': [4, 5, 6, 7]}) c06r4n17: [2024-03-15 10:33:33,252] [INFO] [launch.py:163:main] dist_world_size=8 c06r4n17: [2024-03-15 10:33:33,252] [INFO] [launch.py:165:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3 c06r4n18: [2024-03-15 10:33:34,520] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect) c06r4n18: [2024-03-15 10:33:36,904] [INFO] [launch.py:138:main] 1 NCCL_IB_TIMEOUT=22 c06r4n18: [2024-03-15 10:33:36,904] [INFO] [launch.py:145:main] WORLD INFO DICT: {'c06r4n17': [0, 1, 2, 3], 'c06r4n18': [0, 1, 2, 3]} c06r4n18: [2024-03-15 10:33:36,904] [INFO] [launch.py:151:main] nnodes=2, num_local_procs=4, node_rank=1 c06r4n18: [2024-03-15 10:33:36,904] [INFO] [launch.py:162:main] global_rank_mapping=defaultdict(, {'c06r4n17': [0, 1, 2, 3], 'c06r4n18': [4, 5, 6, 7]}) c06r4n18: [2024-03-15 10:33:36,904] [INFO] [launch.py:163:main] dist_world_size=8 c06r4n18: [2024-03-15 10:33:36,904] [INFO] [launch.py:165:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3 c06r4n17: Traceback (most recent call last): c06r4n17: File "src/train_bash.py", line 1, in c06r4n17: from llmtuner import run_exp c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/__init__.py", line 3, in c06r4n17: from .api import create_app c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/__init__.py", line 1, in c06r4n17: from .app import create_app c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/app.py", line 9, in c06r4n17: from ..chat import ChatModel c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/__init__.py", line 1, in c06r4n17: from .chat_model import ChatModel c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/chat_model.py", line 11, in c06r4n17: from ..model import dispatch_model, load_model_and_tokenizer c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/__init__.py", line 1, in c06r4n17: from .loader import load_model_and_tokenizer c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/loader.py", line 5, in c06r4n17: from trl import AutoModelForCausalLMWithValueHead c06r4n17: ModuleNotFoundError: No module named 'trl' c06r4n17: Traceback (most recent call last): c06r4n17: File "src/train_bash.py", line 1, in c06r4n17: from llmtuner import run_exp c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/__init__.py", line 3, in c06r4n17: from .api import create_app c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/__init__.py", line 1, in c06r4n17: from .app import create_app c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/app.py", line 9, in c06r4n17: from ..chat import ChatModel c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/__init__.py", line 1, in c06r4n17: from .chat_model import ChatModel c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/chat_model.py", line 11, in c06r4n17: from ..model import dispatch_model, load_model_and_tokenizer c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/__init__.py", line 1, in c06r4n17: from .loader import load_model_and_tokenizer c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/loader.py", line 5, in c06r4n17: from trl import AutoModelForCausalLMWithValueHead c06r4n17: ModuleNotFoundError: No module named 'trl' c06r4n17: Traceback (most recent call last): c06r4n17: File "src/train_bash.py", line 1, in c06r4n17: from llmtuner import run_exp c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/__init__.py", line 3, in c06r4n17: from .api import create_app c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/__init__.py", line 1, in c06r4n17: from .app import create_app c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/app.py", line 9, in c06r4n17: from ..chat import ChatModel c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/__init__.py", line 1, in c06r4n17: from .chat_model import ChatModel c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/chat_model.py", line 11, in c06r4n17: from ..model import dispatch_model, load_model_and_tokenizer c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/__init__.py", line 1, in c06r4n17: from .loader import load_model_and_tokenizer c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/loader.py", line 5, in c06r4n17: from trl import AutoModelForCausalLMWithValueHead c06r4n17: ModuleNotFoundError: No module named 'trl' c06r4n17: Traceback (most recent call last): c06r4n17: File "src/train_bash.py", line 1, in c06r4n17: from llmtuner import run_exp c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/__init__.py", line 3, in c06r4n17: from .api import create_app c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/__init__.py", line 1, in c06r4n17: from .app import create_app c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/app.py", line 9, in c06r4n17: from ..chat import ChatModel c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/__init__.py", line 1, in c06r4n17: from .chat_model import ChatModel c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/chat_model.py", line 11, in c06r4n17: from ..model import dispatch_model, load_model_and_tokenizer c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/__init__.py", line 1, in c06r4n17: from .loader import load_model_and_tokenizer c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/loader.py", line 5, in c06r4n17: from trl import AutoModelForCausalLMWithValueHead c06r4n17: ModuleNotFoundError: No module named 'trl' c06r4n18: Traceback (most recent call last): c06r4n18: File "src/train_bash.py", line 1, in c06r4n18: Traceback (most recent call last): c06r4n18: Traceback (most recent call last): c06r4n18: File "src/train_bash.py", line 1, in c06r4n18: Traceback (most recent call last): c06r4n18: File "src/train_bash.py", line 1, in c06r4n18: from llmtuner import run_exp c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/__init__.py", line 3, in c06r4n18: File "src/train_bash.py", line 1, in c06r4n18: from llmtuner import run_exp c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/__init__.py", line 3, in c06r4n18: from .api import create_appfrom llmtuner import run_exp c06r4n18: c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/__init__.py", line 1, in c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/__init__.py", line 3, in c06r4n18: from .api import create_app c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/__init__.py", line 1, in c06r4n18: from .app import create_appfrom .api import create_app c06r4n18: c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/app.py", line 9, in c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/__init__.py", line 1, in c06r4n18: from .app import create_app c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/app.py", line 9, in c06r4n18: from .app import create_app c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/app.py", line 9, in c06r4n18: from ..chat import ChatModel c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/__init__.py", line 1, in c06r4n18: from ..chat import ChatModel c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/__init__.py", line 1, in c06r4n18: from ..chat import ChatModelfrom .chat_model import ChatModel c06r4n18: c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/__init__.py", line 1, in c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/chat_model.py", line 11, in c06r4n18: from .chat_model import ChatModel c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/chat_model.py", line 11, in c06r4n18: from .chat_model import ChatModel c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/chat_model.py", line 11, in c06r4n18: from ..model import dispatch_model, load_model_and_tokenizer c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/__init__.py", line 1, in c06r4n18: from llmtuner import run_exp from ..model import dispatch_model, load_model_and_tokenizer c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/__init__.py", line 1, in c06r4n18: c06r4n18: from ..model import dispatch_model, load_model_and_tokenizer c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/__init__.py", line 1, in c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/__init__.py", line 3, in c06r4n18: from .api import create_app c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/__init__.py", line 1, in c06r4n18: from .app import create_app c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/app.py", line 9, in c06r4n18: from ..chat import ChatModel c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/__init__.py", line 1, in c06r4n18: from .chat_model import ChatModel c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/chat_model.py", line 11, in c06r4n18: from ..model import dispatch_model, load_model_and_tokenizer c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/__init__.py", line 1, in c06r4n18: from .loader import load_model_and_tokenizer c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/loader.py", line 5, in c06r4n18: from .loader import load_model_and_tokenizer c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/loader.py", line 5, in c06r4n18: from .loader import load_model_and_tokenizer c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/loader.py", line 5, in c06r4n18: from .loader import load_model_and_tokenizer c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/loader.py", line 5, in c06r4n18: from trl import AutoModelForCausalLMWithValueHead c06r4n18: ModuleNotFoundError: No module named 'trl' c06r4n18: from trl import AutoModelForCausalLMWithValueHead c06r4n18: ModuleNotFoundError: No module named 'trl' c06r4n18: from trl import AutoModelForCausalLMWithValueHead c06r4n18: ModuleNotFoundError: No module named 'trl' c06r4n18: from trl import AutoModelForCausalLMWithValueHead c06r4n18: ModuleNotFoundError: No module named 'trl' c06r4n17: [2024-03-15 10:33:50,318] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 21792 c06r4n17: [2024-03-15 10:33:50,356] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 21793 c06r4n17: [2024-03-15 10:33:50,380] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 21794 c06r4n17: [2024-03-15 10:33:50,380] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 21795 c06r4n17: [2024-03-15 10:33:50,404] [ERROR] [launch.py:321:sigkill_handler] ['/work/home/liangjing/anaconda3/envs/torch2.1/bin/python', '-u', 'src/train_bash.py', '--local_rank=3', '--stage', 'sft', '--do_train', '--template', 'llama2', '--dataset', 'alpaca_gpt4_en,alpaca_gpt4_zh', '--finetuning_type', 'full', '--model_name_or_path', '/work/home/liangjing/.cache/modelscope/hub/skyline2006/llama-7b', '--output_dir', '/work/share/huchen1/liangjj/llama_factory', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '1', '--gradient_accumulation_steps', '1', '--preprocessing_num_workers', '2', '--lr_scheduler_type', 'cosine', '--logging_steps', '10', '--save_steps', '100', '--eval_steps', '100', '--learning_rate', '5e-5', '--max_grad_norm', '0.5', '--num_train_epochs', '4.0', '--val_size', '0.01', '--evaluation_strategy', 'steps', '--load_best_model_at_end', '--weight_decay', '0.', '--warmup_ratio', '0.03', '--plot_loss', '--fp16', '--save_on_each_node', '--deepspeed', 'deepspeed.json'] exits with return code = 1 c06r4n18: [2024-03-15 10:33:52,961] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 21238 c06r4n18: [2024-03-15 10:33:52,961] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 21239 c06r4n18: [2024-03-15 10:33:53,000] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 21240 c06r4n18: [2024-03-15 10:33:53,012] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 21241 c06r4n18: [2024-03-15 10:33:53,051] [ERROR] [launch.py:321:sigkill_handler] ['/work/home/liangjing/anaconda3/envs/torch2.1/bin/python', '-u', 'src/train_bash.py', '--local_rank=3', '--stage', 'sft', '--do_train', '--template', 'llama2', '--dataset', 'alpaca_gpt4_en,alpaca_gpt4_zh', '--finetuning_type', 'full', '--model_name_or_path', '/work/home/liangjing/.cache/modelscope/hub/skyline2006/llama-7b', '--output_dir', '/work/share/huchen1/liangjj/llama_factory', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '1', '--gradient_accumulation_steps', '1', '--preprocessing_num_workers', '2', '--lr_scheduler_type', 'cosine', '--logging_steps', '10', '--save_steps', '100', '--eval_steps', '100', '--learning_rate', '5e-5', '--max_grad_norm', '0.5', '--num_train_epochs', '4.0', '--val_size', '0.01', '--evaluation_strategy', 'steps', '--load_best_model_at_end', '--weight_decay', '0.', '--warmup_ratio', '0.03', '--plot_loss', '--fp16', '--save_on_each_node', '--deepspeed', 'deepspeed.json'] exits with return code = 1 pdsh@c06r4n17: c06r4n17: ssh exited with exit code 1 pdsh@c06r4n17: c06r4n18: ssh exited with exit code 1