13597831.out 17 KB
Newer Older
wanglch's avatar
wanglch committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
START TIME: Fri Mar 15 10:33:15 CST 2024
[2024-03-15 10:33:20,407] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-03-15 10:33:25,655] [INFO] [runner.py:463:main] Using IP address of 10.3.6.78 for node c06r4n17
[2024-03-15 10:33:25,657] [INFO] [multinode_runner.py:72:get_cmd] Running on the following workers: c06r4n17,c06r4n18
[2024-03-15 10:33:25,657] [INFO] [runner.py:570:main] cmd = pdsh -S -f 1024 -w c06r4n17,c06r4n18 export UCX_MAX_EAGER_LANES=4; export UCX_MAX_RNDV_LANES=4; export UCX_ZCOPY_THRESH=auto; export UCX_WARN_UNUSED_ENV_VARS=n; export UCX_RNDV_THRESH=auto; export NCCL_IB_TIMEOUT=22; export UCX_IB_PCI_BW=mlx5_0:50Gbs,mlx5_1:50Gbs,mlx5_2:50Gbs,mlx5_3:50Gbs; export UCX_NET_DEVICES=mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1; export PYTHONPATH=/work/home/liangjing/LLM/LLaMA-Factory-main;  cd /work/home/liangjing/LLM/LLaMA-Factory-main; /work/home/liangjing/anaconda3/envs/torch2.1/bin/python -u -m deepspeed.launcher.launch --world_info=eyJjMDZyNG4xNyI6IFswLCAxLCAyLCAzXSwgImMwNnI0bjE4IjogWzAsIDEsIDIsIDNdfQ== --node_rank=%n --master_addr=10.3.6.78 --master_port=29500 src/train_bash.py --stage 'sft' --do_train --template 'llama2' --dataset 'alpaca_gpt4_en,alpaca_gpt4_zh' --finetuning_type 'full' --model_name_or_path '/work/home/liangjing/.cache/modelscope/hub/skyline2006/llama-7b' --output_dir '/work/share/huchen1/liangjj/llama_factory' --per_device_train_batch_size '1' --per_device_eval_batch_size '1' --gradient_accumulation_steps '1' --preprocessing_num_workers '2' --lr_scheduler_type 'cosine' --logging_steps '10' --save_steps '100' --eval_steps '100' --learning_rate '5e-5' --max_grad_norm '0.5' --num_train_epochs '4.0' --val_size '0.01' --evaluation_strategy 'steps' --load_best_model_at_end --weight_decay '0.' --warmup_ratio '0.03' --plot_loss --fp16 --save_on_each_node --deepspeed 'deepspeed.json'
c06r4n17: [2024-03-15 10:33:31,800] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
c06r4n17: [2024-03-15 10:33:33,251] [INFO] [launch.py:138:main] 0 NCCL_IB_TIMEOUT=22
c06r4n17: [2024-03-15 10:33:33,251] [INFO] [launch.py:145:main] WORLD INFO DICT: {'c06r4n17': [0, 1, 2, 3], 'c06r4n18': [0, 1, 2, 3]}
c06r4n17: [2024-03-15 10:33:33,251] [INFO] [launch.py:151:main] nnodes=2, num_local_procs=4, node_rank=0
c06r4n17: [2024-03-15 10:33:33,252] [INFO] [launch.py:162:main] global_rank_mapping=defaultdict(<class 'list'>, {'c06r4n17': [0, 1, 2, 3], 'c06r4n18': [4, 5, 6, 7]})
c06r4n17: [2024-03-15 10:33:33,252] [INFO] [launch.py:163:main] dist_world_size=8
c06r4n17: [2024-03-15 10:33:33,252] [INFO] [launch.py:165:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3
c06r4n18: [2024-03-15 10:33:34,520] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
c06r4n18: [2024-03-15 10:33:36,904] [INFO] [launch.py:138:main] 1 NCCL_IB_TIMEOUT=22
c06r4n18: [2024-03-15 10:33:36,904] [INFO] [launch.py:145:main] WORLD INFO DICT: {'c06r4n17': [0, 1, 2, 3], 'c06r4n18': [0, 1, 2, 3]}
c06r4n18: [2024-03-15 10:33:36,904] [INFO] [launch.py:151:main] nnodes=2, num_local_procs=4, node_rank=1
c06r4n18: [2024-03-15 10:33:36,904] [INFO] [launch.py:162:main] global_rank_mapping=defaultdict(<class 'list'>, {'c06r4n17': [0, 1, 2, 3], 'c06r4n18': [4, 5, 6, 7]})
c06r4n18: [2024-03-15 10:33:36,904] [INFO] [launch.py:163:main] dist_world_size=8
c06r4n18: [2024-03-15 10:33:36,904] [INFO] [launch.py:165:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3
c06r4n17: Traceback (most recent call last):
c06r4n17:   File "src/train_bash.py", line 1, in <module>
c06r4n17:     from llmtuner import run_exp
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/__init__.py", line 3, in <module>
c06r4n17:     from .api import create_app
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/__init__.py", line 1, in <module>
c06r4n17:     from .app import create_app
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/app.py", line 9, in <module>
c06r4n17:     from ..chat import ChatModel
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/__init__.py", line 1, in <module>
c06r4n17:     from .chat_model import ChatModel
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/chat_model.py", line 11, in <module>
c06r4n17:     from ..model import dispatch_model, load_model_and_tokenizer
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/__init__.py", line 1, in <module>
c06r4n17:     from .loader import load_model_and_tokenizer
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/loader.py", line 5, in <module>
c06r4n17:     from trl import AutoModelForCausalLMWithValueHead
c06r4n17: ModuleNotFoundError: No module named 'trl'
c06r4n17: Traceback (most recent call last):
c06r4n17:   File "src/train_bash.py", line 1, in <module>
c06r4n17:     from llmtuner import run_exp
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/__init__.py", line 3, in <module>
c06r4n17:     from .api import create_app
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/__init__.py", line 1, in <module>
c06r4n17:     from .app import create_app
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/app.py", line 9, in <module>
c06r4n17:     from ..chat import ChatModel
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/__init__.py", line 1, in <module>
c06r4n17:     from .chat_model import ChatModel
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/chat_model.py", line 11, in <module>
c06r4n17:     from ..model import dispatch_model, load_model_and_tokenizer
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/__init__.py", line 1, in <module>
c06r4n17:     from .loader import load_model_and_tokenizer
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/loader.py", line 5, in <module>
c06r4n17:     from trl import AutoModelForCausalLMWithValueHead
c06r4n17: ModuleNotFoundError: No module named 'trl'
c06r4n17: Traceback (most recent call last):
c06r4n17:   File "src/train_bash.py", line 1, in <module>
c06r4n17:     from llmtuner import run_exp
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/__init__.py", line 3, in <module>
c06r4n17:     from .api import create_app
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/__init__.py", line 1, in <module>
c06r4n17:     from .app import create_app
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/app.py", line 9, in <module>
c06r4n17:     from ..chat import ChatModel
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/__init__.py", line 1, in <module>
c06r4n17:     from .chat_model import ChatModel
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/chat_model.py", line 11, in <module>
c06r4n17:     from ..model import dispatch_model, load_model_and_tokenizer
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/__init__.py", line 1, in <module>
c06r4n17:     from .loader import load_model_and_tokenizer
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/loader.py", line 5, in <module>
c06r4n17:     from trl import AutoModelForCausalLMWithValueHead
c06r4n17: ModuleNotFoundError: No module named 'trl'
c06r4n17: Traceback (most recent call last):
c06r4n17:   File "src/train_bash.py", line 1, in <module>
c06r4n17:     from llmtuner import run_exp
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/__init__.py", line 3, in <module>
c06r4n17:     from .api import create_app
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/__init__.py", line 1, in <module>
c06r4n17:     from .app import create_app
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/app.py", line 9, in <module>
c06r4n17:     from ..chat import ChatModel
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/__init__.py", line 1, in <module>
c06r4n17:     from .chat_model import ChatModel
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/chat_model.py", line 11, in <module>
c06r4n17:     from ..model import dispatch_model, load_model_and_tokenizer
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/__init__.py", line 1, in <module>
c06r4n17:     from .loader import load_model_and_tokenizer
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/loader.py", line 5, in <module>
c06r4n17:     from trl import AutoModelForCausalLMWithValueHead
c06r4n17: ModuleNotFoundError: No module named 'trl'
c06r4n18: Traceback (most recent call last):
c06r4n18:   File "src/train_bash.py", line 1, in <module>
c06r4n18: Traceback (most recent call last):
c06r4n18: Traceback (most recent call last):
c06r4n18:   File "src/train_bash.py", line 1, in <module>
c06r4n18: Traceback (most recent call last):
c06r4n18:   File "src/train_bash.py", line 1, in <module>
c06r4n18:     from llmtuner import run_exp
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/__init__.py", line 3, in <module>
c06r4n18:   File "src/train_bash.py", line 1, in <module>
c06r4n18:     from llmtuner import run_exp
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/__init__.py", line 3, in <module>
c06r4n18:         from .api import create_appfrom llmtuner import run_exp
c06r4n18: 
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/__init__.py", line 1, in <module>
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/__init__.py", line 3, in <module>
c06r4n18:     from .api import create_app
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/__init__.py", line 1, in <module>
c06r4n18:         from .app import create_appfrom .api import create_app
c06r4n18: 
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/app.py", line 9, in <module>
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/__init__.py", line 1, in <module>
c06r4n18:     from .app import create_app
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/app.py", line 9, in <module>
c06r4n18:     from .app import create_app
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/app.py", line 9, in <module>
c06r4n18:     from ..chat import ChatModel
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/__init__.py", line 1, in <module>
c06r4n18:     from ..chat import ChatModel
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/__init__.py", line 1, in <module>
c06r4n18:         from ..chat import ChatModelfrom .chat_model import ChatModel
c06r4n18: 
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/__init__.py", line 1, in <module>
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/chat_model.py", line 11, in <module>
c06r4n18:     from .chat_model import ChatModel
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/chat_model.py", line 11, in <module>
c06r4n18:         from .chat_model import ChatModel
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/chat_model.py", line 11, in <module>
c06r4n18:     from ..model import dispatch_model, load_model_and_tokenizer
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/__init__.py", line 1, in <module>
c06r4n18: from llmtuner import run_exp    from ..model import dispatch_model, load_model_and_tokenizer
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/__init__.py", line 1, in <module>
c06r4n18: 
c06r4n18:     from ..model import dispatch_model, load_model_and_tokenizer
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/__init__.py", line 1, in <module>
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/__init__.py", line 3, in <module>
c06r4n18:     from .api import create_app
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/__init__.py", line 1, in <module>
c06r4n18:     from .app import create_app
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/app.py", line 9, in <module>
c06r4n18:     from ..chat import ChatModel
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/__init__.py", line 1, in <module>
c06r4n18:     from .chat_model import ChatModel
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/chat_model.py", line 11, in <module>
c06r4n18:     from ..model import dispatch_model, load_model_and_tokenizer
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/__init__.py", line 1, in <module>
c06r4n18:     from .loader import load_model_and_tokenizer
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/loader.py", line 5, in <module>
c06r4n18:     from .loader import load_model_and_tokenizer
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/loader.py", line 5, in <module>
c06r4n18:     from .loader import load_model_and_tokenizer
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/loader.py", line 5, in <module>
c06r4n18:     from .loader import load_model_and_tokenizer
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/loader.py", line 5, in <module>
c06r4n18:     from trl import AutoModelForCausalLMWithValueHead
c06r4n18: ModuleNotFoundError: No module named 'trl'
c06r4n18:     from trl import AutoModelForCausalLMWithValueHead
c06r4n18: ModuleNotFoundError: No module named 'trl'
c06r4n18:     from trl import AutoModelForCausalLMWithValueHead
c06r4n18: ModuleNotFoundError: No module named 'trl'
c06r4n18:     from trl import AutoModelForCausalLMWithValueHead
c06r4n18: ModuleNotFoundError: No module named 'trl'
c06r4n17: [2024-03-15 10:33:50,318] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 21792
c06r4n17: [2024-03-15 10:33:50,356] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 21793
c06r4n17: [2024-03-15 10:33:50,380] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 21794
c06r4n17: [2024-03-15 10:33:50,380] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 21795
c06r4n17: [2024-03-15 10:33:50,404] [ERROR] [launch.py:321:sigkill_handler] ['/work/home/liangjing/anaconda3/envs/torch2.1/bin/python', '-u', 'src/train_bash.py', '--local_rank=3', '--stage', 'sft', '--do_train', '--template', 'llama2', '--dataset', 'alpaca_gpt4_en,alpaca_gpt4_zh', '--finetuning_type', 'full', '--model_name_or_path', '/work/home/liangjing/.cache/modelscope/hub/skyline2006/llama-7b', '--output_dir', '/work/share/huchen1/liangjj/llama_factory', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '1', '--gradient_accumulation_steps', '1', '--preprocessing_num_workers', '2', '--lr_scheduler_type', 'cosine', '--logging_steps', '10', '--save_steps', '100', '--eval_steps', '100', '--learning_rate', '5e-5', '--max_grad_norm', '0.5', '--num_train_epochs', '4.0', '--val_size', '0.01', '--evaluation_strategy', 'steps', '--load_best_model_at_end', '--weight_decay', '0.', '--warmup_ratio', '0.03', '--plot_loss', '--fp16', '--save_on_each_node', '--deepspeed', 'deepspeed.json'] exits with return code = 1
c06r4n18: [2024-03-15 10:33:52,961] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 21238
c06r4n18: [2024-03-15 10:33:52,961] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 21239
c06r4n18: [2024-03-15 10:33:53,000] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 21240
c06r4n18: [2024-03-15 10:33:53,012] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 21241
c06r4n18: [2024-03-15 10:33:53,051] [ERROR] [launch.py:321:sigkill_handler] ['/work/home/liangjing/anaconda3/envs/torch2.1/bin/python', '-u', 'src/train_bash.py', '--local_rank=3', '--stage', 'sft', '--do_train', '--template', 'llama2', '--dataset', 'alpaca_gpt4_en,alpaca_gpt4_zh', '--finetuning_type', 'full', '--model_name_or_path', '/work/home/liangjing/.cache/modelscope/hub/skyline2006/llama-7b', '--output_dir', '/work/share/huchen1/liangjj/llama_factory', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '1', '--gradient_accumulation_steps', '1', '--preprocessing_num_workers', '2', '--lr_scheduler_type', 'cosine', '--logging_steps', '10', '--save_steps', '100', '--eval_steps', '100', '--learning_rate', '5e-5', '--max_grad_norm', '0.5', '--num_train_epochs', '4.0', '--val_size', '0.01', '--evaluation_strategy', 'steps', '--load_best_model_at_end', '--weight_decay', '0.', '--warmup_ratio', '0.03', '--plot_loss', '--fp16', '--save_on_each_node', '--deepspeed', 'deepspeed.json'] exits with return code = 1
pdsh@c06r4n17: c06r4n17: ssh exited with exit code 1
pdsh@c06r4n17: c06r4n18: ssh exited with exit code 1