Commit afe180a6 authored by wanglch
Browse files

Initial commit

parents
Pipeline #1006 canceled with stages
#!/bin/bash
# Supervised fine-tuning (SFT) of meta-llama/Llama-2-7b-hf with QLoRA
# (LoRA adapters on q_proj/v_proj over a 4-bit quantized base model),
# launched on a single GPU via LLaMA-Factory's train_bash.py.
#
# Outputs: LoRA adapter checkpoints and loss plot under
#   ../../saves/LLaMA2-7B/lora/sft
#
# NOTE(review): assumes this script is run from its own directory so the
# relative ../../src and ../../data paths resolve — confirm before moving it.

# Fail fast on any error, unset variable, or failed pipeline stage.
set -euo pipefail

CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py \
    --stage sft \
    --do_train \
    --model_name_or_path meta-llama/Llama-2-7b-hf \
    --dataset alpaca_gpt4_en,glaive_toolcall \
    --dataset_dir ../../data \
    --template default \
    --finetuning_type lora \
    --lora_target q_proj,v_proj \
    --output_dir ../../saves/LLaMA2-7B/lora/sft \
    --overwrite_cache \
    --overwrite_output_dir \
    --cutoff_len 1024 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 8 \
    --lr_scheduler_type cosine \
    --logging_steps 10 \
    --save_steps 100 \
    --eval_steps 100 \
    --evaluation_strategy steps \
    --load_best_model_at_end \
    --learning_rate 5e-5 \
    --num_train_epochs 3.0 \
    --max_samples 3000 \
    --val_size 0.1 \
    --quantization_bit 4 \
    --plot_loss \
    --fp16
#!/bin/bash
# LoRA supervised fine-tuning on TheBloke/Llama-2-7B-GPTQ (a base model that
# ships already quantized, hence no --quantization_bit flag here), launched
# on a single GPU through LLaMA-Factory's train_bash.py.
# Adapters are written to ../../saves/LLaMA2-7B/lora/sft.

# Collect every training flag in one array so the invocation stays readable
# and individual options are easy to toggle.
train_args=(
    --stage sft
    --do_train
    --model_name_or_path TheBloke/Llama-2-7B-GPTQ
    --dataset alpaca_gpt4_en,glaive_toolcall
    --dataset_dir ../../data
    --template default
    --finetuning_type lora
    --lora_target q_proj,v_proj
    --output_dir ../../saves/LLaMA2-7B/lora/sft
    --overwrite_cache
    --overwrite_output_dir
    --cutoff_len 1024
    --per_device_train_batch_size 1
    --per_device_eval_batch_size 1
    --gradient_accumulation_steps 8
    --lr_scheduler_type cosine
    --logging_steps 10
    --save_steps 100
    --eval_steps 100
    --evaluation_strategy steps
    --load_best_model_at_end
    --learning_rate 5e-5
    --num_train_epochs 3.0
    --max_samples 3000
    --val_size 0.1
    --plot_loss
    --fp16
)

CUDA_VISIBLE_DEVICES=0 python ../../src/train_bash.py "${train_args[@]}"
c06r3n06
c06r3n07
c06r3n08
c06r3n09
c06r3n06
c06r3n07
c06r3n08
c06r3n09
c06r3n06
c06r3n07
c06r3n08
c06r3n09
c06r4n17 slots=4
c06r4n18 slots=4
c06r4n17 slots=4
c06r4n18 slots=4
c06r3n06 slots=4
c06r3n07 slots=4
c06r3n08 slots=4
c06r3n09 slots=4
c06r3n06 slots=4
c06r3n07 slots=4
c06r3n08 slots=4
c06r3n09 slots=4
c06r3n06 slots=4
c06r3n07 slots=4
c06r3n08 slots=4
c06r3n09 slots=4
c06r3n06 slots=4
c06r3n07 slots=4
c06r3n08 slots=4
c06r3n09 slots=4
START TIME: Fri Mar 15 10:25:18 CST 2024
[2024-03-15 10:26:04,995] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-03-15 10:26:39,246] [INFO] [runner.py:463:main] Using IP address of 10.3.6.78 for node c06r4n17
[2024-03-15 10:26:39,315] [INFO] [multinode_runner.py:72:get_cmd] Running on the following workers: c06r4n17,c06r4n18
[2024-03-15 10:26:39,315] [INFO] [runner.py:570:main] cmd = pdsh -S -f 1024 -w c06r4n17,c06r4n18 export UCX_MAX_EAGER_LANES=4; export UCX_MAX_RNDV_LANES=4; export UCX_ZCOPY_THRESH=auto; export UCX_WARN_UNUSED_ENV_VARS=n; export UCX_RNDV_THRESH=auto; export NCCL_IB_TIMEOUT=22; export UCX_IB_PCI_BW=mlx5_0:50Gbs,mlx5_1:50Gbs,mlx5_2:50Gbs,mlx5_3:50Gbs; export UCX_NET_DEVICES=mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1; export PYTHONPATH=/work/home/liangjing/LLM/LLaMA-Factory-main; cd /work/home/liangjing/LLM/LLaMA-Factory-main; /work/home/liangjing/anaconda3/envs/torch2.1/bin/python -u -m deepspeed.launcher.launch --world_info=eyJjMDZyNG4xNyI6IFswLCAxLCAyLCAzXSwgImMwNnI0bjE4IjogWzAsIDEsIDIsIDNdfQ== --node_rank=%n --master_addr=10.3.6.78 --master_port=29500 src/train_bash.py --stage 'sft' --do_train --template 'llama2' --dataset 'alpaca_gpt4_en,alpaca_gpt4_zh' --finetuning_type 'full' --model_name_or_path '/work/home/liangjing/.cache/modelscope/hub/skyline2006/llama-7b' --output_dir '/work/share/huchen1/liangjj/llama_factory' --per_device_train_batch_size '1' --per_device_eval_batch_size '1' --gradient_accumulation_steps '1' --preprocessing_num_workers '2' --lr_scheduler_type 'cosine' --logging_steps '10' --save_steps '100' --eval_steps '100' --learning_rate '5e-5' --max_grad_norm '0.5' --num_train_epochs '4.0' --val_size '0.01' --evaluation_strategy 'steps' --load_best_model_at_end --weight_decay '0.' --warmup_ratio '0.03' --plot_loss --fp16 --save_on_each_node --deepspeed 'deepspeed.json'
c06r4n17: [2024-03-15 10:26:58,215] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
c06r4n17: [2024-03-15 10:26:59,591] [INFO] [launch.py:138:main] 0 NCCL_IB_TIMEOUT=22
c06r4n17: [2024-03-15 10:26:59,591] [INFO] [launch.py:145:main] WORLD INFO DICT: {'c06r4n17': [0, 1, 2, 3], 'c06r4n18': [0, 1, 2, 3]}
c06r4n17: [2024-03-15 10:26:59,592] [INFO] [launch.py:151:main] nnodes=2, num_local_procs=4, node_rank=0
c06r4n17: [2024-03-15 10:26:59,592] [INFO] [launch.py:162:main] global_rank_mapping=defaultdict(<class 'list'>, {'c06r4n17': [0, 1, 2, 3], 'c06r4n18': [4, 5, 6, 7]})
c06r4n17: [2024-03-15 10:26:59,592] [INFO] [launch.py:163:main] dist_world_size=8
c06r4n17: [2024-03-15 10:26:59,592] [INFO] [launch.py:165:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3
c06r4n18: [2024-03-15 10:27:17,946] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
c06r4n17: Traceback (most recent call last):
c06r4n17: File "src/train_bash.py", line 1, in <module>
c06r4n17: from llmtuner import run_exp
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/__init__.py", line 3, in <module>
c06r4n17: from .api import create_app
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/__init__.py", line 1, in <module>
c06r4n17: from .app import create_app
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/app.py", line 9, in <module>
c06r4n17: from ..chat import ChatModel
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/__init__.py", line 1, in <module>
c06r4n17: from .chat_model import ChatModel
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/chat_model.py", line 8, in <module>
c06r4n17: from ..data import get_template_and_fix_tokenizer
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/__init__.py", line 1, in <module>
c06r4n17: from .loader import get_dataset
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/loader.py", line 10, in <module>
c06r4n17: from .parser import get_dataset_list
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/parser.py", line 7, in <module>
c06r4n17: from ..extras.misc import use_modelscope
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/extras/misc.py", line 6, in <module>
c06r4n17: from peft import PeftModel
c06r4n17: ModuleNotFoundError: No module named 'peft'
c06r4n17: Traceback (most recent call last):
c06r4n17: File "src/train_bash.py", line 1, in <module>
c06r4n17: from llmtuner import run_exp
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/__init__.py", line 3, in <module>
c06r4n17: from .api import create_app
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/__init__.py", line 1, in <module>
c06r4n17: from .app import create_app
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/app.py", line 9, in <module>
c06r4n17: from ..chat import ChatModel
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/__init__.py", line 1, in <module>
c06r4n17: from .chat_model import ChatModel
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/chat_model.py", line 8, in <module>
c06r4n17: from ..data import get_template_and_fix_tokenizer
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/__init__.py", line 1, in <module>
c06r4n17: from .loader import get_dataset
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/loader.py", line 10, in <module>
c06r4n17: from .parser import get_dataset_list
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/parser.py", line 7, in <module>
c06r4n17: from ..extras.misc import use_modelscope
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/extras/misc.py", line 6, in <module>
c06r4n17: from peft import PeftModel
c06r4n17: ModuleNotFoundError: No module named 'peft'
c06r4n17: Traceback (most recent call last):
c06r4n17: File "src/train_bash.py", line 1, in <module>
c06r4n17: from llmtuner import run_exp
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/__init__.py", line 3, in <module>
c06r4n17: from .api import create_app
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/__init__.py", line 1, in <module>
c06r4n17: from .app import create_app
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/app.py", line 9, in <module>
c06r4n17: from ..chat import ChatModel
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/__init__.py", line 1, in <module>
c06r4n17: from .chat_model import ChatModel
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/chat_model.py", line 8, in <module>
c06r4n17: from ..data import get_template_and_fix_tokenizer
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/__init__.py", line 1, in <module>
c06r4n17: from .loader import get_dataset
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/loader.py", line 10, in <module>
c06r4n17: from .parser import get_dataset_list
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/parser.py", line 7, in <module>
c06r4n17: from ..extras.misc import use_modelscope
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/extras/misc.py", line 6, in <module>
c06r4n17: from peft import PeftModel
c06r4n17: ModuleNotFoundError: No module named 'peft'
c06r4n17: Traceback (most recent call last):
c06r4n17: File "src/train_bash.py", line 1, in <module>
c06r4n17: from llmtuner import run_exp
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/__init__.py", line 3, in <module>
c06r4n17: from .api import create_app
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/__init__.py", line 1, in <module>
c06r4n17: from .app import create_app
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/app.py", line 9, in <module>
c06r4n17: from ..chat import ChatModel
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/__init__.py", line 1, in <module>
c06r4n17: from .chat_model import ChatModel
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/chat_model.py", line 8, in <module>
c06r4n17: from ..data import get_template_and_fix_tokenizer
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/__init__.py", line 1, in <module>
c06r4n17: from .loader import get_dataset
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/loader.py", line 10, in <module>
c06r4n17: from .parser import get_dataset_list
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/parser.py", line 7, in <module>
c06r4n17: from ..extras.misc import use_modelscope
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/extras/misc.py", line 6, in <module>
c06r4n17: from peft import PeftModel
c06r4n17: ModuleNotFoundError: No module named 'peft'
c06r4n17: [2024-03-15 10:27:23,658] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 16379
c06r4n17: [2024-03-15 10:27:23,659] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 16380
c06r4n17: [2024-03-15 10:27:23,684] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 16381
c06r4n17: [2024-03-15 10:27:23,708] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 16382
c06r4n17: [2024-03-15 10:27:23,732] [ERROR] [launch.py:321:sigkill_handler] ['/work/home/liangjing/anaconda3/envs/torch2.1/bin/python', '-u', 'src/train_bash.py', '--local_rank=3', '--stage', 'sft', '--do_train', '--template', 'llama2', '--dataset', 'alpaca_gpt4_en,alpaca_gpt4_zh', '--finetuning_type', 'full', '--model_name_or_path', '/work/home/liangjing/.cache/modelscope/hub/skyline2006/llama-7b', '--output_dir', '/work/share/huchen1/liangjj/llama_factory', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '1', '--gradient_accumulation_steps', '1', '--preprocessing_num_workers', '2', '--lr_scheduler_type', 'cosine', '--logging_steps', '10', '--save_steps', '100', '--eval_steps', '100', '--learning_rate', '5e-5', '--max_grad_norm', '0.5', '--num_train_epochs', '4.0', '--val_size', '0.01', '--evaluation_strategy', 'steps', '--load_best_model_at_end', '--weight_decay', '0.', '--warmup_ratio', '0.03', '--plot_loss', '--fp16', '--save_on_each_node', '--deepspeed', 'deepspeed.json'] exits with return code = 1
pdsh@c06r4n17: c06r4n17: ssh exited with exit code 1
c06r4n18: [2024-03-15 10:27:41,876] [INFO] [launch.py:138:main] 1 NCCL_IB_TIMEOUT=22
c06r4n18: [2024-03-15 10:27:41,876] [INFO] [launch.py:145:main] WORLD INFO DICT: {'c06r4n17': [0, 1, 2, 3], 'c06r4n18': [0, 1, 2, 3]}
c06r4n18: [2024-03-15 10:27:41,876] [INFO] [launch.py:151:main] nnodes=2, num_local_procs=4, node_rank=1
c06r4n18: [2024-03-15 10:27:41,876] [INFO] [launch.py:162:main] global_rank_mapping=defaultdict(<class 'list'>, {'c06r4n17': [0, 1, 2, 3], 'c06r4n18': [4, 5, 6, 7]})
c06r4n18: [2024-03-15 10:27:41,876] [INFO] [launch.py:163:main] dist_world_size=8
c06r4n18: [2024-03-15 10:27:41,876] [INFO] [launch.py:165:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3
c06r4n18: Traceback (most recent call last):
c06r4n18: File "src/train_bash.py", line 1, in <module>
c06r4n18: Traceback (most recent call last):
c06r4n18: File "src/train_bash.py", line 1, in <module>
c06r4n18: from llmtuner import run_exp
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/__init__.py", line 3, in <module>
c06r4n18: Traceback (most recent call last):
c06r4n18: Traceback (most recent call last):
c06r4n18: File "src/train_bash.py", line 1, in <module>
c06r4n18: from llmtuner import run_exp
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/__init__.py", line 3, in <module>
c06r4n18: from .api import create_app
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/__init__.py", line 1, in <module>
c06r4n18: File "src/train_bash.py", line 1, in <module>
c06r4n18: from .api import create_app
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/__init__.py", line 1, in <module>
c06r4n18: from llmtuner import run_exp
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/__init__.py", line 3, in <module>
c06r4n18: from .app import create_app
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/app.py", line 9, in <module>
c06r4n18: from .app import create_app
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/app.py", line 9, in <module>
c06r4n18: from .api import create_app
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/__init__.py", line 1, in <module>
c06r4n18: from ..chat import ChatModel
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/__init__.py", line 1, in <module>
c06r4n18: from .app import create_app
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/app.py", line 9, in <module>
c06r4n18: from ..chat import ChatModel
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/__init__.py", line 1, in <module>
c06r4n18: from .chat_model import ChatModel
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/chat_model.py", line 8, in <module>
c06r4n18: from .chat_model import ChatModel
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/chat_model.py", line 8, in <module>
c06r4n18: from ..chat import ChatModel
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/__init__.py", line 1, in <module>
c06r4n18: from llmtuner import run_exp
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/__init__.py", line 3, in <module>
c06r4n18: from .chat_model import ChatModel
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/chat_model.py", line 8, in <module>
c06r4n18: from ..data import get_template_and_fix_tokenizer
c06r4n18: from ..data import get_template_and_fix_tokenizer File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/__init__.py", line 1, in <module>
c06r4n18:
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/__init__.py", line 1, in <module>
c06r4n18: from ..data import get_template_and_fix_tokenizer
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/__init__.py", line 1, in <module>
c06r4n18: from .api import create_app
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/__init__.py", line 1, in <module>
c06r4n18: from .app import create_app
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/app.py", line 9, in <module>
c06r4n18: from ..chat import ChatModel
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/__init__.py", line 1, in <module>
c06r4n18: from .chat_model import ChatModel
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/chat_model.py", line 8, in <module>
c06r4n18: from ..data import get_template_and_fix_tokenizer
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/__init__.py", line 1, in <module>
c06r4n18: from .loader import get_dataset
c06r4n18: from .loader import get_dataset File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/loader.py", line 10, in <module>
c06r4n18:
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/loader.py", line 10, in <module>
c06r4n18: from .loader import get_dataset
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/loader.py", line 10, in <module>
c06r4n18: from .loader import get_dataset
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/loader.py", line 10, in <module>
c06r4n18: from .parser import get_dataset_list
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/parser.py", line 7, in <module>
c06r4n18: from .parser import get_dataset_list
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/parser.py", line 7, in <module>
c06r4n18: from .parser import get_dataset_list
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/parser.py", line 7, in <module>
c06r4n18: from .parser import get_dataset_list
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/data/parser.py", line 7, in <module>
c06r4n18: from ..extras.misc import use_modelscope
c06r4n18: from ..extras.misc import use_modelscope File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/extras/misc.py", line 6, in <module>
c06r4n18:
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/extras/misc.py", line 6, in <module>
c06r4n18: from ..extras.misc import use_modelscope
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/extras/misc.py", line 6, in <module>
c06r4n18: from ..extras.misc import use_modelscope
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/extras/misc.py", line 6, in <module>
c06r4n18: from peft import PeftModel
c06r4n18: ModuleNotFoundError: No module named 'peft'
c06r4n18: from peft import PeftModel
c06r4n18: ModuleNotFoundError: No module named 'peft'
c06r4n18: from peft import PeftModel
c06r4n18: ModuleNotFoundError: No module named 'peft'
c06r4n18: from peft import PeftModel
c06r4n18: ModuleNotFoundError: No module named 'peft'
c06r4n18: [2024-03-15 10:28:15,961] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 16721
c06r4n18: [2024-03-15 10:28:16,002] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 16722
c06r4n18: [2024-03-15 10:28:16,040] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 16723
c06r4n18: [2024-03-15 10:28:16,040] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 16724
c06r4n18: [2024-03-15 10:28:16,079] [ERROR] [launch.py:321:sigkill_handler] ['/work/home/liangjing/anaconda3/envs/torch2.1/bin/python', '-u', 'src/train_bash.py', '--local_rank=3', '--stage', 'sft', '--do_train', '--template', 'llama2', '--dataset', 'alpaca_gpt4_en,alpaca_gpt4_zh', '--finetuning_type', 'full', '--model_name_or_path', '/work/home/liangjing/.cache/modelscope/hub/skyline2006/llama-7b', '--output_dir', '/work/share/huchen1/liangjj/llama_factory', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '1', '--gradient_accumulation_steps', '1', '--preprocessing_num_workers', '2', '--lr_scheduler_type', 'cosine', '--logging_steps', '10', '--save_steps', '100', '--eval_steps', '100', '--learning_rate', '5e-5', '--max_grad_norm', '0.5', '--num_train_epochs', '4.0', '--val_size', '0.01', '--evaluation_strategy', 'steps', '--load_best_model_at_end', '--weight_decay', '0.', '--warmup_ratio', '0.03', '--plot_loss', '--fp16', '--save_on_each_node', '--deepspeed', 'deepspeed.json'] exits with return code = 1
pdsh@c06r4n17: c06r4n18: ssh exited with exit code 1
START TIME: Fri Mar 15 10:33:15 CST 2024
[2024-03-15 10:33:20,407] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-03-15 10:33:25,655] [INFO] [runner.py:463:main] Using IP address of 10.3.6.78 for node c06r4n17
[2024-03-15 10:33:25,657] [INFO] [multinode_runner.py:72:get_cmd] Running on the following workers: c06r4n17,c06r4n18
[2024-03-15 10:33:25,657] [INFO] [runner.py:570:main] cmd = pdsh -S -f 1024 -w c06r4n17,c06r4n18 export UCX_MAX_EAGER_LANES=4; export UCX_MAX_RNDV_LANES=4; export UCX_ZCOPY_THRESH=auto; export UCX_WARN_UNUSED_ENV_VARS=n; export UCX_RNDV_THRESH=auto; export NCCL_IB_TIMEOUT=22; export UCX_IB_PCI_BW=mlx5_0:50Gbs,mlx5_1:50Gbs,mlx5_2:50Gbs,mlx5_3:50Gbs; export UCX_NET_DEVICES=mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1; export PYTHONPATH=/work/home/liangjing/LLM/LLaMA-Factory-main; cd /work/home/liangjing/LLM/LLaMA-Factory-main; /work/home/liangjing/anaconda3/envs/torch2.1/bin/python -u -m deepspeed.launcher.launch --world_info=eyJjMDZyNG4xNyI6IFswLCAxLCAyLCAzXSwgImMwNnI0bjE4IjogWzAsIDEsIDIsIDNdfQ== --node_rank=%n --master_addr=10.3.6.78 --master_port=29500 src/train_bash.py --stage 'sft' --do_train --template 'llama2' --dataset 'alpaca_gpt4_en,alpaca_gpt4_zh' --finetuning_type 'full' --model_name_or_path '/work/home/liangjing/.cache/modelscope/hub/skyline2006/llama-7b' --output_dir '/work/share/huchen1/liangjj/llama_factory' --per_device_train_batch_size '1' --per_device_eval_batch_size '1' --gradient_accumulation_steps '1' --preprocessing_num_workers '2' --lr_scheduler_type 'cosine' --logging_steps '10' --save_steps '100' --eval_steps '100' --learning_rate '5e-5' --max_grad_norm '0.5' --num_train_epochs '4.0' --val_size '0.01' --evaluation_strategy 'steps' --load_best_model_at_end --weight_decay '0.' --warmup_ratio '0.03' --plot_loss --fp16 --save_on_each_node --deepspeed 'deepspeed.json'
c06r4n17: [2024-03-15 10:33:31,800] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
c06r4n17: [2024-03-15 10:33:33,251] [INFO] [launch.py:138:main] 0 NCCL_IB_TIMEOUT=22
c06r4n17: [2024-03-15 10:33:33,251] [INFO] [launch.py:145:main] WORLD INFO DICT: {'c06r4n17': [0, 1, 2, 3], 'c06r4n18': [0, 1, 2, 3]}
c06r4n17: [2024-03-15 10:33:33,251] [INFO] [launch.py:151:main] nnodes=2, num_local_procs=4, node_rank=0
c06r4n17: [2024-03-15 10:33:33,252] [INFO] [launch.py:162:main] global_rank_mapping=defaultdict(<class 'list'>, {'c06r4n17': [0, 1, 2, 3], 'c06r4n18': [4, 5, 6, 7]})
c06r4n17: [2024-03-15 10:33:33,252] [INFO] [launch.py:163:main] dist_world_size=8
c06r4n17: [2024-03-15 10:33:33,252] [INFO] [launch.py:165:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3
c06r4n18: [2024-03-15 10:33:34,520] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
c06r4n18: [2024-03-15 10:33:36,904] [INFO] [launch.py:138:main] 1 NCCL_IB_TIMEOUT=22
c06r4n18: [2024-03-15 10:33:36,904] [INFO] [launch.py:145:main] WORLD INFO DICT: {'c06r4n17': [0, 1, 2, 3], 'c06r4n18': [0, 1, 2, 3]}
c06r4n18: [2024-03-15 10:33:36,904] [INFO] [launch.py:151:main] nnodes=2, num_local_procs=4, node_rank=1
c06r4n18: [2024-03-15 10:33:36,904] [INFO] [launch.py:162:main] global_rank_mapping=defaultdict(<class 'list'>, {'c06r4n17': [0, 1, 2, 3], 'c06r4n18': [4, 5, 6, 7]})
c06r4n18: [2024-03-15 10:33:36,904] [INFO] [launch.py:163:main] dist_world_size=8
c06r4n18: [2024-03-15 10:33:36,904] [INFO] [launch.py:165:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3
c06r4n17: Traceback (most recent call last):
c06r4n17: File "src/train_bash.py", line 1, in <module>
c06r4n17: from llmtuner import run_exp
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/__init__.py", line 3, in <module>
c06r4n17: from .api import create_app
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/__init__.py", line 1, in <module>
c06r4n17: from .app import create_app
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/app.py", line 9, in <module>
c06r4n17: from ..chat import ChatModel
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/__init__.py", line 1, in <module>
c06r4n17: from .chat_model import ChatModel
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/chat_model.py", line 11, in <module>
c06r4n17: from ..model import dispatch_model, load_model_and_tokenizer
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/__init__.py", line 1, in <module>
c06r4n17: from .loader import load_model_and_tokenizer
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/loader.py", line 5, in <module>
c06r4n17: from trl import AutoModelForCausalLMWithValueHead
c06r4n17: ModuleNotFoundError: No module named 'trl'
c06r4n17: Traceback (most recent call last):
c06r4n17: File "src/train_bash.py", line 1, in <module>
c06r4n17: from llmtuner import run_exp
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/__init__.py", line 3, in <module>
c06r4n17: from .api import create_app
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/__init__.py", line 1, in <module>
c06r4n17: from .app import create_app
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/app.py", line 9, in <module>
c06r4n17: from ..chat import ChatModel
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/__init__.py", line 1, in <module>
c06r4n17: from .chat_model import ChatModel
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/chat_model.py", line 11, in <module>
c06r4n17: from ..model import dispatch_model, load_model_and_tokenizer
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/__init__.py", line 1, in <module>
c06r4n17: from .loader import load_model_and_tokenizer
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/loader.py", line 5, in <module>
c06r4n17: from trl import AutoModelForCausalLMWithValueHead
c06r4n17: ModuleNotFoundError: No module named 'trl'
c06r4n17: Traceback (most recent call last):
c06r4n17: File "src/train_bash.py", line 1, in <module>
c06r4n17: from llmtuner import run_exp
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/__init__.py", line 3, in <module>
c06r4n17: from .api import create_app
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/__init__.py", line 1, in <module>
c06r4n17: from .app import create_app
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/app.py", line 9, in <module>
c06r4n17: from ..chat import ChatModel
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/__init__.py", line 1, in <module>
c06r4n17: from .chat_model import ChatModel
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/chat_model.py", line 11, in <module>
c06r4n17: from ..model import dispatch_model, load_model_and_tokenizer
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/__init__.py", line 1, in <module>
c06r4n17: from .loader import load_model_and_tokenizer
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/loader.py", line 5, in <module>
c06r4n17: from trl import AutoModelForCausalLMWithValueHead
c06r4n17: ModuleNotFoundError: No module named 'trl'
c06r4n17: Traceback (most recent call last):
c06r4n17: File "src/train_bash.py", line 1, in <module>
c06r4n17: from llmtuner import run_exp
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/__init__.py", line 3, in <module>
c06r4n17: from .api import create_app
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/__init__.py", line 1, in <module>
c06r4n17: from .app import create_app
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/app.py", line 9, in <module>
c06r4n17: from ..chat import ChatModel
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/__init__.py", line 1, in <module>
c06r4n17: from .chat_model import ChatModel
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/chat_model.py", line 11, in <module>
c06r4n17: from ..model import dispatch_model, load_model_and_tokenizer
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/__init__.py", line 1, in <module>
c06r4n17: from .loader import load_model_and_tokenizer
c06r4n17: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/loader.py", line 5, in <module>
c06r4n17: from trl import AutoModelForCausalLMWithValueHead
c06r4n17: ModuleNotFoundError: No module named 'trl'
c06r4n18: Traceback (most recent call last):
c06r4n18: File "src/train_bash.py", line 1, in <module>
c06r4n18: Traceback (most recent call last):
c06r4n18: Traceback (most recent call last):
c06r4n18: File "src/train_bash.py", line 1, in <module>
c06r4n18: Traceback (most recent call last):
c06r4n18: File "src/train_bash.py", line 1, in <module>
c06r4n18: from llmtuner import run_exp
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/__init__.py", line 3, in <module>
c06r4n18: File "src/train_bash.py", line 1, in <module>
c06r4n18: from llmtuner import run_exp
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/__init__.py", line 3, in <module>
c06r4n18: from .api import create_appfrom llmtuner import run_exp
c06r4n18:
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/__init__.py", line 1, in <module>
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/__init__.py", line 3, in <module>
c06r4n18: from .api import create_app
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/__init__.py", line 1, in <module>
c06r4n18: from .app import create_appfrom .api import create_app
c06r4n18:
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/app.py", line 9, in <module>
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/__init__.py", line 1, in <module>
c06r4n18: from .app import create_app
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/app.py", line 9, in <module>
c06r4n18: from .app import create_app
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/app.py", line 9, in <module>
c06r4n18: from ..chat import ChatModel
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/__init__.py", line 1, in <module>
c06r4n18: from ..chat import ChatModel
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/__init__.py", line 1, in <module>
c06r4n18: from ..chat import ChatModelfrom .chat_model import ChatModel
c06r4n18:
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/__init__.py", line 1, in <module>
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/chat_model.py", line 11, in <module>
c06r4n18: from .chat_model import ChatModel
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/chat_model.py", line 11, in <module>
c06r4n18: from .chat_model import ChatModel
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/chat_model.py", line 11, in <module>
c06r4n18: from ..model import dispatch_model, load_model_and_tokenizer
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/__init__.py", line 1, in <module>
c06r4n18: from llmtuner import run_exp from ..model import dispatch_model, load_model_and_tokenizer
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/__init__.py", line 1, in <module>
c06r4n18:
c06r4n18: from ..model import dispatch_model, load_model_and_tokenizer
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/__init__.py", line 1, in <module>
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/__init__.py", line 3, in <module>
c06r4n18: from .api import create_app
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/__init__.py", line 1, in <module>
c06r4n18: from .app import create_app
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/api/app.py", line 9, in <module>
c06r4n18: from ..chat import ChatModel
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/__init__.py", line 1, in <module>
c06r4n18: from .chat_model import ChatModel
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/chat/chat_model.py", line 11, in <module>
c06r4n18: from ..model import dispatch_model, load_model_and_tokenizer
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/__init__.py", line 1, in <module>
c06r4n18: from .loader import load_model_and_tokenizer
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/loader.py", line 5, in <module>
c06r4n18: from .loader import load_model_and_tokenizer
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/loader.py", line 5, in <module>
c06r4n18: from .loader import load_model_and_tokenizer
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/loader.py", line 5, in <module>
c06r4n18: from .loader import load_model_and_tokenizer
c06r4n18: File "/work/home/liangjing/LLM/LLaMA-Factory-main/src/llmtuner/model/loader.py", line 5, in <module>
c06r4n18: from trl import AutoModelForCausalLMWithValueHead
c06r4n18: ModuleNotFoundError: No module named 'trl'
c06r4n18: from trl import AutoModelForCausalLMWithValueHead
c06r4n18: ModuleNotFoundError: No module named 'trl'
c06r4n18: from trl import AutoModelForCausalLMWithValueHead
c06r4n18: ModuleNotFoundError: No module named 'trl'
c06r4n18: from trl import AutoModelForCausalLMWithValueHead
c06r4n18: ModuleNotFoundError: No module named 'trl'
c06r4n17: [2024-03-15 10:33:50,318] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 21792
c06r4n17: [2024-03-15 10:33:50,356] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 21793
c06r4n17: [2024-03-15 10:33:50,380] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 21794
c06r4n17: [2024-03-15 10:33:50,380] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 21795
c06r4n17: [2024-03-15 10:33:50,404] [ERROR] [launch.py:321:sigkill_handler] ['/work/home/liangjing/anaconda3/envs/torch2.1/bin/python', '-u', 'src/train_bash.py', '--local_rank=3', '--stage', 'sft', '--do_train', '--template', 'llama2', '--dataset', 'alpaca_gpt4_en,alpaca_gpt4_zh', '--finetuning_type', 'full', '--model_name_or_path', '/work/home/liangjing/.cache/modelscope/hub/skyline2006/llama-7b', '--output_dir', '/work/share/huchen1/liangjj/llama_factory', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '1', '--gradient_accumulation_steps', '1', '--preprocessing_num_workers', '2', '--lr_scheduler_type', 'cosine', '--logging_steps', '10', '--save_steps', '100', '--eval_steps', '100', '--learning_rate', '5e-5', '--max_grad_norm', '0.5', '--num_train_epochs', '4.0', '--val_size', '0.01', '--evaluation_strategy', 'steps', '--load_best_model_at_end', '--weight_decay', '0.', '--warmup_ratio', '0.03', '--plot_loss', '--fp16', '--save_on_each_node', '--deepspeed', 'deepspeed.json'] exits with return code = 1
c06r4n18: [2024-03-15 10:33:52,961] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 21238
c06r4n18: [2024-03-15 10:33:52,961] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 21239
c06r4n18: [2024-03-15 10:33:53,000] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 21240
c06r4n18: [2024-03-15 10:33:53,012] [INFO] [launch.py:315:sigkill_handler] Killing subprocess 21241
c06r4n18: [2024-03-15 10:33:53,051] [ERROR] [launch.py:321:sigkill_handler] ['/work/home/liangjing/anaconda3/envs/torch2.1/bin/python', '-u', 'src/train_bash.py', '--local_rank=3', '--stage', 'sft', '--do_train', '--template', 'llama2', '--dataset', 'alpaca_gpt4_en,alpaca_gpt4_zh', '--finetuning_type', 'full', '--model_name_or_path', '/work/home/liangjing/.cache/modelscope/hub/skyline2006/llama-7b', '--output_dir', '/work/share/huchen1/liangjj/llama_factory', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '1', '--gradient_accumulation_steps', '1', '--preprocessing_num_workers', '2', '--lr_scheduler_type', 'cosine', '--logging_steps', '10', '--save_steps', '100', '--eval_steps', '100', '--learning_rate', '5e-5', '--max_grad_norm', '0.5', '--num_train_epochs', '4.0', '--val_size', '0.01', '--evaluation_strategy', 'steps', '--load_best_model_at_end', '--weight_decay', '0.', '--warmup_ratio', '0.03', '--plot_loss', '--fp16', '--save_on_each_node', '--deepspeed', 'deepspeed.json'] exits with return code = 1
pdsh@c06r4n17: c06r4n17: ssh exited with exit code 1
pdsh@c06r4n17: c06r4n18: ssh exited with exit code 1
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
#!/bin/bash
#SBATCH -p xahdnormal
#SBATCH -N 4
#SBATCH --cpus-per-task=8
#SBATCH --ntasks-per-node=4
#SBATCH --gres=dcu:4
#SBATCH -J llama
#SBATCH -t 72:00:00
#SBATCH -w c06r3n[06-09]
#SBATCH -o ./logs/%j.out
#SBATCH -e ./logs/%j.out
#
# Slurm launcher for a 4-node LLaMA fine-tuning run:
# resolves the allocated node list into a DeepSpeed-style hostfile
# ("<host> slots=4" per node, 4 DCUs each), copies it to ./hosts,
# then hands off to run-full.sh.

set -euo pipefail

echo "START TIME: $(date)"

export NCCL_IB_TIMEOUT=22
ulimit -c 0   # disable core dumps (a crashed rank would otherwise dump GBs)
export XDG_CACHE_HOME=/work/home/liangjing/.cache
export HF_DATASETS_CACHE=/work/home/liangjing/.cache/huggingface/datasets

# Per-job hostfiles keyed by job id so concurrent jobs do not collide.
mkdir -p ./hostfile
hostfile=./hostfile/$SLURM_JOB_ID
scontrol show hostnames "$SLURM_JOB_NODELIST" > "$hostfile"
#shuf ./hostfile/hostfile -o "$hostfile"
#bash ./hostfile/nodelist_reset.sh "$hostfile" "$hostfile"_reset

# Append " slots=4" to every hostname in one pass; '>' (not '>>') so a
# re-submitted job id never accumulates duplicate entries.
dl_hostfile=$(pwd)/hostfile/hostfile-dl-$SLURM_JOB_ID
sed 's/$/ slots=4/' "$hostfile" > "$dl_hostfile"

cp "$dl_hostfile" hosts
bash run-full.sh
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"
[tool.black]
line-length = 119
target-version = ["py38"]
[tool.ruff]
line-length = 119
indent-width = 4
[tool.ruff.lint]
ignore = ["C408", "C901", "E501", "E731", "E741", "W605"]
select = ["C", "E", "F", "I", "W"]
[tool.ruff.lint.isort]
lines-after-imports = 2
known-first-party = ["llmtuner"]
[tool.ruff.format]
quote-style = "double"
indent-style = "space"
skip-magic-trailing-comma = false
line-ending = "auto"
[isort]
default_section = "FIRSTPARTY"
known_first_party = "llmtuner"
known_third_party = [
"accelerate",
"datasets",
"gradio",
"numpy",
"peft",
"torch",
"transformers",
"trl"
]
line_length = 119
lines_after_imports = 2
multi_line_output = 3
include_trailing_comma = true
force_grid_wrap = 0
use_parentheses = true
ensure_newline_before_comments = true
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment