13597689.out 19.2 KB
Newer Older
wanglch's avatar
wanglch committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
START TIME: Fri Mar 15 10:25:18 CST 2024
[2024-03-15 10:26:04,995] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-03-15 10:26:39,246] [INFO] [runner.py:463:main] Using IP address of 10.3.6.78 for node c06r4n17
[2024-03-15 10:26:39,315] [INFO] [multinode_runner.py:72:get_cmd] Running on the following workers: c06r4n17,c06r4n18
[2024-03-15 10:26:39,315] [INFO] [runner.py:570:main] cmd = pdsh -S -f 1024 -w c06r4n17,c06r4n18 export UCX_MAX_EAGER_LANES=4; export UCX_MAX_RNDV_LANES=4; export UCX_ZCOPY_THRESH=auto; export UCX_WARN_UNUSED_ENV_VARS=n; export UCX_RNDV_THRESH=auto; export NCCL_IB_TIMEOUT=22; export UCX_IB_PCI_BW=mlx5_0:50Gbs,mlx5_1:50Gbs,mlx5_2:50Gbs,mlx5_3:50Gbs; export UCX_NET_DEVICES=mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1; export PYTHONPATH=/work/home/liangjing/LLM/LLaMA-Factory-main;  cd /work/home/liangjing/LLM/LLaMA-Factory-main; /work/home/liangjing/anaconda3/envs/torch2.1/bin/python -u -m deepspeed.launcher.launch --world_info=eyJjMDZyNG4xNyI6IFswLCAxLCAyLCAzXSwgImMwNnI0bjE4IjogWzAsIDEsIDIsIDNdfQ== --node_rank=%n --master_addr=10.3.6.78 --master_port=29500 src/train_bash.py --stage 'sft' --do_train --template 'llama2' --dataset 'alpaca_gpt4_en,alpaca_gpt4_zh' --finetuning_type 'full' --model_name_or_path '/work/home/liangjing/.cache/modelscope/hub/skyline2006/llama-7b' --output_dir '/work/share/huchen1/liangjj/llama_factory' --per_device_train_batch_size '1' --per_device_eval_batch_size '1' --gradient_accumulation_steps '1' --preprocessing_num_workers '2' --lr_scheduler_type 'cosine' --logging_steps '10' --save_steps '100' --eval_steps '100' --learning_rate '5e-5' --max_grad_norm '0.5' --num_train_epochs '4.0' --val_size '0.01' --evaluation_strategy 'steps' --load_best_model_at_end --weight_decay '0.' --warmup_ratio '0.03' --plot_loss --fp16 --save_on_each_node --deepspeed 'deepspeed.json'
c06r4n17: [2024-03-15 10:26:58,215] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
c06r4n17: [2024-03-15 10:26:59,591] [INFO] [launch.py:138:main] 0 NCCL_IB_TIMEOUT=22
c06r4n17: [2024-03-15 10:26:59,591] [INFO] [launch.py:145:main] WORLD INFO DICT: {'c06r4n17': [0, 1, 2, 3], 'c06r4n18': [0, 1, 2, 3]}
c06r4n17: [2024-03-15 10:26:59,592] [INFO] [launch.py:151:main] nnodes=2, num_local_procs=4, node_rank=0
c06r4n17: [2024-03-15 10:26:59,592] [INFO] [launch.py:162:main] global_rank_mapping=defaultdict(<class 'list'>, {'c06r4n17': [0, 1, 2, 3], 'c06r4n18': [4, 5, 6, 7]})
c06r4n17: [2024-03-15 10:26:59,592] [INFO] [launch.py:163:main] dist_world_size=8
c06r4n17: [2024-03-15 10:26:59,592] [INFO] [launch.py:165:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3
c06r4n18: [2024-03-15 10:27:17,946] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
c06r4n17: Traceback (most recent call last):
c06r4n17:   File "src/train_bash.py", line 1, in <module>
c06r4n17:     from llmtuner import run_exp
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/__init__.py", line 3, in <module>
c06r4n17:     from .api import create_app
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/__init__.py", line 1, in <module>
c06r4n17:     from .app import create_app
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/app.py", line 9, in <module>
c06r4n17:     from ..chat import ChatModel
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/__init__.py", line 1, in <module>
c06r4n17:     from .chat_model import ChatModel
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/chat_model.py", line 8, in <module>
c06r4n17:     from ..data import get_template_and_fix_tokenizer
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/__init__.py", line 1, in <module>
c06r4n17:     from .loader import get_dataset
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/loader.py", line 10, in <module>
c06r4n17:     from .parser import get_dataset_list
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/parser.py", line 7, in <module>
c06r4n17:     from ..extras.misc import use_modelscope
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/extras/misc.py", line 6, in <module>
c06r4n17:     from peft import PeftModel
c06r4n17: ModuleNotFoundError: No module named 'peft'
c06r4n17: Traceback (most recent call last):
c06r4n17:   File "src/train_bash.py", line 1, in <module>
c06r4n17:     from llmtuner import run_exp
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/__init__.py", line 3, in <module>
c06r4n17:     from .api import create_app
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/__init__.py", line 1, in <module>
c06r4n17:     from .app import create_app
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/app.py", line 9, in <module>
c06r4n17:     from ..chat import ChatModel
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/__init__.py", line 1, in <module>
c06r4n17:     from .chat_model import ChatModel
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/chat_model.py", line 8, in <module>
c06r4n17:     from ..data import get_template_and_fix_tokenizer
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/__init__.py", line 1, in <module>
c06r4n17:     from .loader import get_dataset
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/loader.py", line 10, in <module>
c06r4n17:     from .parser import get_dataset_list
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/parser.py", line 7, in <module>
c06r4n17:     from ..extras.misc import use_modelscope
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/extras/misc.py", line 6, in <module>
c06r4n17:     from peft import PeftModel
c06r4n17: ModuleNotFoundError: No module named 'peft'
c06r4n17: Traceback (most recent call last):
c06r4n17:   File "src/train_bash.py", line 1, in <module>
c06r4n17:     from llmtuner import run_exp
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/__init__.py", line 3, in <module>
c06r4n17:     from .api import create_app
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/__init__.py", line 1, in <module>
c06r4n17:     from .app import create_app
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/app.py", line 9, in <module>
c06r4n17:     from ..chat import ChatModel
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/__init__.py", line 1, in <module>
c06r4n17:     from .chat_model import ChatModel
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/chat_model.py", line 8, in <module>
c06r4n17:     from ..data import get_template_and_fix_tokenizer
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/__init__.py", line 1, in <module>
c06r4n17:     from .loader import get_dataset
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/loader.py", line 10, in <module>
c06r4n17:     from .parser import get_dataset_list
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/parser.py", line 7, in <module>
c06r4n17:     from ..extras.misc import use_modelscope
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/extras/misc.py", line 6, in <module>
c06r4n17:     from peft import PeftModel
c06r4n17: ModuleNotFoundError: No module named 'peft'
c06r4n17: Traceback (most recent call last):
c06r4n17:   File "src/train_bash.py", line 1, in <module>
c06r4n17:     from llmtuner import run_exp
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/__init__.py", line 3, in <module>
c06r4n17:     from .api import create_app
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/__init__.py", line 1, in <module>
c06r4n17:     from .app import create_app
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/app.py", line 9, in <module>
c06r4n17:     from ..chat import ChatModel
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/__init__.py", line 1, in <module>
c06r4n17:     from .chat_model import ChatModel
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/chat_model.py", line 8, in <module>
c06r4n17:     from ..data import get_template_and_fix_tokenizer
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/__init__.py", line 1, in <module>
c06r4n17:     from .loader import get_dataset
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/loader.py", line 10, in <module>
c06r4n17:     from .parser import get_dataset_list
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/parser.py", line 7, in <module>
c06r4n17:     from ..extras.misc import use_modelscope
c06r4n17:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/extras/misc.py", line 6, in <module>
c06r4n17:     from peft import PeftModel
c06r4n17: ModuleNotFoundError: No module named 'peft'
c06r4n17: [2024-03-15 10:27:23,658] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 16379
c06r4n17: [2024-03-15 10:27:23,659] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 16380
c06r4n17: [2024-03-15 10:27:23,684] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 16381
c06r4n17: [2024-03-15 10:27:23,708] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 16382
c06r4n17: [2024-03-15 10:27:23,732] [ERROR] [launch.py:321:sigkill_handler] ['/work/home/liangjing/anaconda3/envs/torch2.1/bin/python', '-u', 'src/train_bash.py', '--local_rank=3', '--stage', 'sft', '--do_train', '--template', 'llama2', '--dataset', 'alpaca_gpt4_en,alpaca_gpt4_zh', '--finetuning_type', 'full', '--model_name_or_path', '/work/home/liangjing/.cache/modelscope/hub/skyline2006/llama-7b', '--output_dir', '/work/share/huchen1/liangjj/llama_factory', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '1', '--gradient_accumulation_steps', '1', '--preprocessing_num_workers', '2', '--lr_scheduler_type', 'cosine', '--logging_steps', '10', '--save_steps', '100', '--eval_steps', '100', '--learning_rate', '5e-5', '--max_grad_norm', '0.5', '--num_train_epochs', '4.0', '--val_size', '0.01', '--evaluation_strategy', 'steps', '--load_best_model_at_end', '--weight_decay', '0.', '--warmup_ratio', '0.03', '--plot_loss', '--fp16', '--save_on_each_node', '--deepspeed', 'deepspeed.json'] exits with return code = 1
pdsh@c06r4n17: c06r4n17: ssh exited with exit code 1
c06r4n18: [2024-03-15 10:27:41,876] [INFO] [launch.py:138:main] 1 NCCL_IB_TIMEOUT=22
c06r4n18: [2024-03-15 10:27:41,876] [INFO] [launch.py:145:main] WORLD INFO DICT: {'c06r4n17': [0, 1, 2, 3], 'c06r4n18': [0, 1, 2, 3]}
c06r4n18: [2024-03-15 10:27:41,876] [INFO] [launch.py:151:main] nnodes=2, num_local_procs=4, node_rank=1
c06r4n18: [2024-03-15 10:27:41,876] [INFO] [launch.py:162:main] global_rank_mapping=defaultdict(<class 'list'>, {'c06r4n17': [0, 1, 2, 3], 'c06r4n18': [4, 5, 6, 7]})
c06r4n18: [2024-03-15 10:27:41,876] [INFO] [launch.py:163:main] dist_world_size=8
c06r4n18: [2024-03-15 10:27:41,876] [INFO] [launch.py:165:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3
c06r4n18: Traceback (most recent call last):
c06r4n18:   File "src/train_bash.py", line 1, in <module>
c06r4n18: Traceback (most recent call last):
c06r4n18:   File "src/train_bash.py", line 1, in <module>
c06r4n18:     from llmtuner import run_exp
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/__init__.py", line 3, in <module>
c06r4n18: Traceback (most recent call last):
c06r4n18: Traceback (most recent call last):
c06r4n18:   File "src/train_bash.py", line 1, in <module>
c06r4n18:     from llmtuner import run_exp
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/__init__.py", line 3, in <module>
c06r4n18:     from .api import create_app
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/__init__.py", line 1, in <module>
c06r4n18:   File "src/train_bash.py", line 1, in <module>
c06r4n18:     from .api import create_app
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/__init__.py", line 1, in <module>
c06r4n18:     from llmtuner import run_exp
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/__init__.py", line 3, in <module>
c06r4n18:     from .app import create_app
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/app.py", line 9, in <module>
c06r4n18:     from .app import create_app
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/app.py", line 9, in <module>
c06r4n18:     from .api import create_app
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/__init__.py", line 1, in <module>
c06r4n18:     from ..chat import ChatModel
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/__init__.py", line 1, in <module>
c06r4n18:     from .app import create_app
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/app.py", line 9, in <module>
c06r4n18:     from ..chat import ChatModel
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/__init__.py", line 1, in <module>
c06r4n18:         from .chat_model import ChatModel
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/chat_model.py", line 8, in <module>
c06r4n18:     from .chat_model import ChatModel
c06r4n18:       File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/chat_model.py", line 8, in <module>
c06r4n18: from ..chat import ChatModel
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/__init__.py", line 1, in <module>
c06r4n18: from llmtuner import run_exp
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/__init__.py", line 3, in <module>
c06r4n18:     from .chat_model import ChatModel
c06r4n18:       File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/chat_model.py", line 8, in <module>
c06r4n18: from ..data import get_template_and_fix_tokenizer
c06r4n18:     from ..data import get_template_and_fix_tokenizer  File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/__init__.py", line 1, in <module>
c06r4n18: 
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/__init__.py", line 1, in <module>
c06r4n18:     from ..data import get_template_and_fix_tokenizer
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/__init__.py", line 1, in <module>
c06r4n18:     from .api import create_app
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/__init__.py", line 1, in <module>
c06r4n18:     from .app import create_app
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/app.py", line 9, in <module>
c06r4n18:     from ..chat import ChatModel
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/__init__.py", line 1, in <module>
c06r4n18:     from .chat_model import ChatModel
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/chat_model.py", line 8, in <module>
c06r4n18:     from ..data import get_template_and_fix_tokenizer
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/__init__.py", line 1, in <module>
c06r4n18:     from .loader import get_dataset
c06r4n18:     from .loader import get_dataset  File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/loader.py", line 10, in <module>
c06r4n18: 
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/loader.py", line 10, in <module>
c06r4n18:     from .loader import get_dataset
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/loader.py", line 10, in <module>
c06r4n18:     from .loader import get_dataset
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/loader.py", line 10, in <module>
c06r4n18:     from .parser import get_dataset_list
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/parser.py", line 7, in <module>
c06r4n18:     from .parser import get_dataset_list
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/parser.py", line 7, in <module>
c06r4n18:     from .parser import get_dataset_list
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/parser.py", line 7, in <module>
c06r4n18:     from .parser import get_dataset_list
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/parser.py", line 7, in <module>
c06r4n18:     from ..extras.misc import use_modelscope
c06r4n18:     from ..extras.misc import use_modelscope  File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/extras/misc.py", line 6, in <module>
c06r4n18: 
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/extras/misc.py", line 6, in <module>
c06r4n18:     from ..extras.misc import use_modelscope
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/extras/misc.py", line 6, in <module>
c06r4n18:     from ..extras.misc import use_modelscope
c06r4n18:   File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/extras/misc.py", line 6, in <module>
c06r4n18:     from peft import PeftModel
c06r4n18: ModuleNotFoundError: No module named 'peft'
c06r4n18:     from peft import PeftModel
c06r4n18: ModuleNotFoundError: No module named 'peft'
c06r4n18:     from peft import PeftModel
c06r4n18: ModuleNotFoundError: No module named 'peft'
c06r4n18:     from peft import PeftModel
c06r4n18: ModuleNotFoundError: No module named 'peft'
c06r4n18: [2024-03-15 10:28:15,961] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 16721
c06r4n18: [2024-03-15 10:28:16,002] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 16722
c06r4n18: [2024-03-15 10:28:16,040] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 16723
c06r4n18: [2024-03-15 10:28:16,040] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 16724
c06r4n18: [2024-03-15 10:28:16,079] [ERROR] [launch.py:321:sigkill_handler] ['/work/home/liangjing/anaconda3/envs/torch2.1/bin/python', '-u', 'src/train_bash.py', '--local_rank=3', '--stage', 'sft', '--do_train', '--template', 'llama2', '--dataset', 'alpaca_gpt4_en,alpaca_gpt4_zh', '--finetuning_type', 'full', '--model_name_or_path', '/work/home/liangjing/.cache/modelscope/hub/skyline2006/llama-7b', '--output_dir', '/work/share/huchen1/liangjj/llama_factory', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '1', '--gradient_accumulation_steps', '1', '--preprocessing_num_workers', '2', '--lr_scheduler_type', 'cosine', '--logging_steps', '10', '--save_steps', '100', '--eval_steps', '100', '--learning_rate', '5e-5', '--max_grad_norm', '0.5', '--num_train_epochs', '4.0', '--val_size', '0.01', '--evaluation_strategy', 'steps', '--load_best_model_at_end', '--weight_decay', '0.', '--warmup_ratio', '0.03', '--plot_loss', '--fp16', '--save_on_each_node', '--deepspeed', 'deepspeed.json'] exits with return code = 1
pdsh@c06r4n17: c06r4n18: ssh exited with exit code 1