# Copyright 2024 HuggingFace Inc. and the LlamaFactory team.
#
# This code is inspired by the HuggingFace's transformers library.
# https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/language-modeling/run_clm.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import os
import sys
from typing import Any, Dict, Optional, Tuple

import torch
import transformers
from transformers import HfArgumentParser, Seq2SeqTrainingArguments
from transformers.integrations import is_deepspeed_zero3_enabled
from transformers.trainer_utils import get_last_checkpoint
from transformers.training_args import ParallelMode
from transformers.utils import is_torch_bf16_gpu_available, is_torch_npu_available
from transformers.utils.versions import require_version

from ..extras.constants import CHECKPOINT_NAMES
from ..extras.logging import get_logger
from ..extras.misc import check_dependencies, get_current_device
from .data_args import DataArguments
from .evaluation_args import EvaluationArguments
from .finetuning_args import FinetuningArguments
from .generating_args import GeneratingArguments
from .model_args import ModelArguments


# Module-level logger, created through the project's own `get_logger` helper.
logger = get_logger(__name__)


# Executed once at import time; what exactly is checked is defined in `..extras.misc`.
check_dependencies()


# For each entry point, `_*_ARGS` is the list of argument classes handed to `HfArgumentParser`
# and `_*_CLS` is the matching tuple type (same classes, same order) used as a return annotation.
_TRAIN_ARGS = [ModelArguments, DataArguments, Seq2SeqTrainingArguments, FinetuningArguments, GeneratingArguments]
_TRAIN_CLS = Tuple[ModelArguments, DataArguments, Seq2SeqTrainingArguments, FinetuningArguments, GeneratingArguments]
_INFER_ARGS = [ModelArguments, DataArguments, FinetuningArguments, GeneratingArguments]
_INFER_CLS = Tuple[ModelArguments, DataArguments, FinetuningArguments, GeneratingArguments]
_EVAL_ARGS = [ModelArguments, DataArguments, EvaluationArguments, FinetuningArguments]
_EVAL_CLS = Tuple[ModelArguments, DataArguments, EvaluationArguments, FinetuningArguments]


def _parse_args(parser: "HfArgumentParser", args: Optional[Dict[str, Any]] = None) -> Tuple[Any, ...]:
    """Parse arguments from a dict, a YAML/JSON config file, or the command line.

    Args:
        parser: The argument parser to run.
        args: Optional mapping of argument names to values. When given, it is parsed
            with ``parser.parse_dict`` and ``sys.argv`` is ignored.

    Returns:
        Whatever the selected parser method returns for the dict / file cases, or a tuple
        of the parsed dataclasses for the command-line case.

    Raises:
        ValueError: If the command line contains arguments the parser does not consume.
    """
    if args is not None:
        return parser.parse_dict(args)

    # A single positional argument is treated as the path to a config file.
    if len(sys.argv) == 2:
        config_path = sys.argv[1]
        if config_path.endswith((".yaml", ".yml")):
            return parser.parse_yaml_file(os.path.abspath(config_path))

        if config_path.endswith(".json"):
            return parser.parse_json_file(os.path.abspath(config_path))

    # With `return_remaining_strings=True` the last element holds the unconsumed strings.
    (*parsed_args, unknown_args) = parser.parse_args_into_dataclasses(return_remaining_strings=True)

    if unknown_args:
        print(parser.format_help())
        print("Got unknown args, potentially deprecated arguments: {}".format(unknown_args))
        raise ValueError("Some specified arguments are not used by the HfArgumentParser: {}".format(unknown_args))

    return (*parsed_args,)


def _set_transformers_logging(log_level: Optional[int] = logging.INFO) -> None:
    """Configure the logging facilities of the `transformers` library.

    Sets the verbosity to ``log_level``, then enables the default handler and the
    explicit log format, in that order.
    """
    hf_logging = transformers.utils.logging
    hf_logging.set_verbosity(log_level)
    hf_logging.enable_default_handler()
    hf_logging.enable_explicit_format()


def _verify_model_args(
    model_args: "ModelArguments",
    data_args: "DataArguments",
    finetuning_args: "FinetuningArguments",
) -> None:
    """Validate adapter / quantization settings shared by training, inference and evaluation.

    Args:
        model_args: Model arguments; `use_fast_tokenizer` may be overwritten (see below).
        data_args: Data arguments; only `template` is read.
        finetuning_args: Fine-tuning arguments.

    Raises:
        ValueError: If adapters or quantization are combined with unsupported options.
    """
    if model_args.adapter_name_or_path is not None and finetuning_args.finetuning_type != "lora":
        raise ValueError("Adapter is only valid for the LoRA method.")

    if model_args.quantization_bit is not None:
        if finetuning_args.finetuning_type != "lora":
            raise ValueError("Quantization is only compatible with the LoRA method.")

        if finetuning_args.pissa_init:
            raise ValueError("Please use scripts/pissa_init.py to initialize PiSSA for a quantized model.")

        if model_args.resize_vocab:
            raise ValueError("Cannot resize embedding layers of a quantized model.")

        if model_args.adapter_name_or_path is not None and finetuning_args.create_new_adapter:
            raise ValueError("Cannot create new adapter upon a quantized model.")

        if model_args.adapter_name_or_path is not None and len(model_args.adapter_name_or_path) != 1:
            raise ValueError("Quantized model only accepts a single adapter. Merge them first.")

    # The "yi" template is forced onto the slow tokenizer instead of failing.
    if data_args.template == "yi" and model_args.use_fast_tokenizer:
        logger.warning("We should use slow tokenizer for the Yi models. Change `use_fast_tokenizer` to False.")
        model_args.use_fast_tokenizer = False


def _check_extra_dependencies(
    model_args: "ModelArguments",
    finetuning_args: "FinetuningArguments",
    training_args: Optional["Seq2SeqTrainingArguments"] = None,
) -> None:
    """Call `require_version` for every optional package the selected features need.

    Args:
        model_args: Model arguments selecting unsloth, liger-kernel, mixture-of-depths or vLLM.
        finetuning_args: Fine-tuning arguments selecting GaLore, BAdam, Adam-mini or loss plotting.
        training_args: Optional training arguments; when given and `predict_with_generate`
            is set, the packages required for that mode are checked as well.
    """
    if model_args.use_unsloth:
        require_version("unsloth", "Please install unsloth: https://github.com/unslothai/unsloth")

    if model_args.enable_liger_kernel:
        require_version("liger-kernel", "To fix: pip install liger-kernel")

    if model_args.mixture_of_depths is not None:
        require_version("mixture-of-depth>=1.1.6", "To fix: pip install mixture-of-depth>=1.1.6")

    if model_args.infer_backend == "vllm":
        require_version("vllm>=0.4.3,<=0.6.2", "To fix: pip install vllm>=0.4.3,<=0.6.2")

    if finetuning_args.use_galore:
        require_version("galore_torch", "To fix: pip install galore_torch")

    if finetuning_args.use_badam:
        require_version("badam>=1.2.1", "To fix: pip install badam>=1.2.1")

    if finetuning_args.use_adam_mini:
        require_version("adam-mini", "To fix: pip install adam-mini")

    if finetuning_args.plot_loss:
        require_version("matplotlib", "To fix: pip install matplotlib")

    if training_args is not None and training_args.predict_with_generate:
        require_version("jieba", "To fix: pip install jieba")
        require_version("nltk", "To fix: pip install nltk")
        require_version("rouge_chinese", "To fix: pip install rouge-chinese")


def _parse_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS:
    """Parse the argument classes listed in `_TRAIN_ARGS` via `_parse_args`."""
    return _parse_args(HfArgumentParser(_TRAIN_ARGS), args)


def _parse_infer_args(args: Optional[Dict[str, Any]] = None) -> _INFER_CLS:
    """Parse the argument classes listed in `_INFER_ARGS` via `_parse_args`."""
    return _parse_args(HfArgumentParser(_INFER_ARGS), args)


def _parse_eval_args(args: Optional[Dict[str, Any]] = None) -> _EVAL_CLS:
    """Parse the argument classes listed in `_EVAL_ARGS` via `_parse_args`."""
    return _parse_args(HfArgumentParser(_EVAL_ARGS), args)


def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS:
    """Parse, validate and post-process the arguments of a training run.

    Args:
        args: Optional mapping of argument names to values; when omitted the arguments
            are read from a config file or the command line (see `_parse_args`).

    Returns:
        The tuple ``(model_args, data_args, training_args, finetuning_args, generating_args)``
        with several fields adjusted in place (e.g. `ddp_find_unused_parameters`,
        `resume_from_checkpoint`, `compute_dtype`, `device_map`, `packing`).

    Raises:
        ValueError: If the parsed arguments form an unsupported combination, or if the
            output directory already holds checkpoint files and may not be overwritten.
    """
    model_args, data_args, training_args, finetuning_args, generating_args = _parse_train_args(args)

    # Setup logging
    if training_args.should_log:
        _set_transformers_logging()

    # Check arguments
    if finetuning_args.stage != "pt" and data_args.template is None:
        raise ValueError("Please specify which `template` to use.")

    if finetuning_args.stage != "sft":
        if training_args.predict_with_generate:
            raise ValueError("`predict_with_generate` cannot be set as True except SFT.")

        if data_args.neat_packing:
            raise ValueError("`neat_packing` cannot be set as True except SFT.")

        if data_args.train_on_prompt or data_args.mask_history:
            raise ValueError("`train_on_prompt` or `mask_history` cannot be set as True except SFT.")

    if finetuning_args.stage == "sft" and training_args.do_predict and not training_args.predict_with_generate:
        raise ValueError("Please enable `predict_with_generate` to save model predictions.")

    if finetuning_args.stage in ["rm", "ppo"] and training_args.load_best_model_at_end:
        raise ValueError("RM and PPO stages do not support `load_best_model_at_end`.")

    if finetuning_args.stage == "ppo":
        if not training_args.do_train:
            raise ValueError("PPO training does not support evaluation, use the SFT stage to evaluate models.")

        if model_args.shift_attn:
            raise ValueError("PPO training is incompatible with S^2-Attn.")

        if finetuning_args.reward_model_type == "lora" and model_args.use_unsloth:
            raise ValueError("Unsloth does not support lora reward model.")

        # Only the first configured reporter is inspected.
        if training_args.report_to and training_args.report_to[0] not in ["wandb", "tensorboard"]:
            raise ValueError("PPO only accepts wandb or tensorboard logger.")

    if training_args.parallel_mode == ParallelMode.NOT_DISTRIBUTED:
        raise ValueError("Please launch distributed training with `llamafactory-cli` or `torchrun`.")

    if training_args.deepspeed and training_args.parallel_mode != ParallelMode.DISTRIBUTED:
        raise ValueError("Please use `FORCE_TORCHRUN=1` to launch DeepSpeed training.")

    if training_args.max_steps == -1 and data_args.streaming:
        raise ValueError("Please specify `max_steps` in streaming mode.")

    if training_args.do_train and data_args.dataset is None:
        raise ValueError("Please specify dataset for training.")

    if (training_args.do_eval or training_args.do_predict) and (
        data_args.eval_dataset is None and data_args.val_size < 1e-6
    ):
        raise ValueError("Please specify dataset for evaluation.")

    if training_args.predict_with_generate:
        if is_deepspeed_zero3_enabled():
            raise ValueError("`predict_with_generate` is incompatible with DeepSpeed ZeRO-3.")

        if data_args.eval_dataset is None:
            raise ValueError("Cannot use `predict_with_generate` if `eval_dataset` is None.")

        if finetuning_args.compute_accuracy:
            raise ValueError("Cannot use `predict_with_generate` and `compute_accuracy` together.")

    if training_args.do_train and model_args.quantization_device_map == "auto":
        raise ValueError("Cannot use device map for quantized models in training.")

    if finetuning_args.pissa_init and is_deepspeed_zero3_enabled():
        raise ValueError("Please use scripts/pissa_init.py to initialize PiSSA in DeepSpeed ZeRO-3.")

    if finetuning_args.pure_bf16:
        if not (is_torch_bf16_gpu_available() or (is_torch_npu_available() and torch.npu.is_bf16_supported())):
            raise ValueError("This device does not support `pure_bf16`.")

        if is_deepspeed_zero3_enabled():
            raise ValueError("`pure_bf16` is incompatible with DeepSpeed ZeRO-3.")

    if (
        finetuning_args.use_galore
        and finetuning_args.galore_layerwise
        and training_args.parallel_mode == ParallelMode.DISTRIBUTED
    ):
        raise ValueError("Distributed training does not support layer-wise GaLore.")

    if finetuning_args.use_badam and training_args.parallel_mode == ParallelMode.DISTRIBUTED:
        if finetuning_args.badam_mode == "ratio":
            raise ValueError("Ratio-based BAdam does not yet support distributed training, use layer-wise BAdam.")
        elif not is_deepspeed_zero3_enabled():
            raise ValueError("Layer-wise BAdam only supports DeepSpeed ZeRO-3 training.")

    if finetuning_args.use_galore and training_args.deepspeed is not None:
        raise ValueError("GaLore is incompatible with DeepSpeed yet.")

    if model_args.infer_backend == "vllm":
        raise ValueError("vLLM backend is only available for API, CLI and Web.")

    if model_args.use_unsloth and is_deepspeed_zero3_enabled():
        raise ValueError("Unsloth is incompatible with DeepSpeed ZeRO-3.")

    if data_args.neat_packing and not data_args.packing:
        logger.warning("`neat_packing` requires `packing` is True. Change `packing` to True.")
        data_args.packing = True

    _verify_model_args(model_args, data_args, finetuning_args)
    _check_extra_dependencies(model_args, finetuning_args, training_args)

    # Non-fatal recommendations
    if (
        training_args.do_train
        and finetuning_args.finetuning_type == "lora"
        and model_args.quantization_bit is None
        and model_args.resize_vocab
        and finetuning_args.additional_target is None
    ):
        logger.warning("Remember to add embedding layers to `additional_target` to make the added tokens trainable.")

    if training_args.do_train and model_args.quantization_bit is not None and (not model_args.upcast_layernorm):
        logger.warning("We recommend enable `upcast_layernorm` in quantized training.")

    if training_args.do_train and (not training_args.fp16) and (not training_args.bf16):
        logger.warning("We recommend enable mixed precision training.")

    if training_args.do_train and finetuning_args.use_galore and not finetuning_args.pure_bf16:
        logger.warning("Using GaLore with mixed precision training may significantly increases GPU memory usage.")

    if (not training_args.do_train) and model_args.quantization_bit is not None:
        logger.warning("Evaluating model in 4/8-bit mode may cause lower scores.")

    if (not training_args.do_train) and finetuning_args.stage == "dpo" and finetuning_args.ref_model is None:
        logger.warning("Specify `ref_model` for computing rewards at evaluation.")

    # Post-process training arguments
    if (
        training_args.parallel_mode == ParallelMode.DISTRIBUTED
        and training_args.ddp_find_unused_parameters is None
        and finetuning_args.finetuning_type == "lora"
    ):
        logger.warning("`ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training.")
        training_args.ddp_find_unused_parameters = False

    if finetuning_args.stage in ["rm", "ppo"] and finetuning_args.finetuning_type in ["full", "freeze"]:
        can_resume_from_checkpoint = False
        if training_args.resume_from_checkpoint is not None:
            logger.warning("Cannot resume from checkpoint in current stage.")
            training_args.resume_from_checkpoint = None
    else:
        can_resume_from_checkpoint = True

    # Auto-detect the last checkpoint in an existing output directory.
    if (
        training_args.resume_from_checkpoint is None
        and training_args.do_train
        and os.path.isdir(training_args.output_dir)
        and not training_args.overwrite_output_dir
        and can_resume_from_checkpoint
    ):
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        # No checkpoint sub-directory, but checkpoint files sit directly in the output directory.
        if last_checkpoint is None and any(
            os.path.isfile(os.path.join(training_args.output_dir, name)) for name in CHECKPOINT_NAMES
        ):
            raise ValueError("Output directory already exists and is not empty. Please set `overwrite_output_dir`.")

        if last_checkpoint is not None:
            training_args.resume_from_checkpoint = last_checkpoint
            logger.info("Resuming training from {}.".format(training_args.resume_from_checkpoint))
            logger.info("Change `output_dir` or use `overwrite_output_dir` to avoid.")

    if (
        finetuning_args.stage in ["rm", "ppo"]
        and finetuning_args.finetuning_type == "lora"
        and training_args.resume_from_checkpoint is not None
    ):
        logger.warning(
            "Add {} to `adapter_name_or_path` to resume training from checkpoint.".format(
                training_args.resume_from_checkpoint
            )
        )

    # Post-process model arguments
    if training_args.bf16 or finetuning_args.pure_bf16:
        model_args.compute_dtype = torch.bfloat16
    elif training_args.fp16:
        model_args.compute_dtype = torch.float16

    model_args.device_map = {"": get_current_device()}
    model_args.model_max_length = data_args.cutoff_len
    model_args.block_diag_attn = data_args.neat_packing
    # When `packing` is unset it defaults to True only for the "pt" stage.
    data_args.packing = data_args.packing if data_args.packing is not None else finetuning_args.stage == "pt"

    # Log on each process the small summary
    logger.info(
        "Process rank: {}, device: {}, n_gpu: {}, distributed training: {}, compute dtype: {}".format(
            training_args.local_rank,
            training_args.device,
            training_args.n_gpu,
            training_args.parallel_mode == ParallelMode.DISTRIBUTED,
            str(model_args.compute_dtype),
        )
    )

    transformers.set_seed(training_args.seed)

    return model_args, data_args, training_args, finetuning_args, generating_args


def get_infer_args(args: Optional[Dict[str, Any]] = None) -> _INFER_CLS:
    """Parse, validate and post-process the arguments of an inference run.

    Args:
        args: Optional mapping of argument names to values; when omitted the arguments
            are read from a config file or the command line (see `_parse_args`).

    Returns:
        The tuple ``(model_args, data_args, finetuning_args, generating_args)`` with
        `model_args.device_map` set.

    Raises:
        ValueError: If no template is given or the vLLM / adapter / quantization
            settings form an unsupported combination.
    """
    model_args, data_args, finetuning_args, generating_args = _parse_infer_args(args)

    _set_transformers_logging()

    if data_args.template is None:
        raise ValueError("Please specify which `template` to use.")

    if model_args.infer_backend == "vllm":
        if finetuning_args.stage != "sft":
            raise ValueError("vLLM engine only supports auto-regressive models.")

        if model_args.quantization_bit is not None:
            raise ValueError("vLLM engine does not support bnb quantization (GPTQ and AWQ are supported).")

        if model_args.rope_scaling is not None:
            raise ValueError("vLLM engine does not support RoPE scaling.")

        if model_args.adapter_name_or_path is not None and len(model_args.adapter_name_or_path) != 1:
            raise ValueError("vLLM only accepts a single adapter. Merge them first.")

    _verify_model_args(model_args, data_args, finetuning_args)
    _check_extra_dependencies(model_args, finetuning_args)

    # Exporting on CPU pins the model to the CPU; every other case uses the "auto" device map.
    if model_args.export_dir is not None and model_args.export_device == "cpu":
        model_args.device_map = {"": torch.device("cpu")}
        model_args.model_max_length = data_args.cutoff_len
    else:
        model_args.device_map = "auto"

    return model_args, data_args, finetuning_args, generating_args


def get_eval_args(args: Optional[Dict[str, Any]] = None) -> _EVAL_CLS:
    """Parse, validate and post-process the arguments of an evaluation run.

    Args:
        args: Optional mapping of argument names to values; when omitted the arguments
            are read from a config file or the command line (see `_parse_args`).

    Returns:
        The tuple ``(model_args, data_args, eval_args, finetuning_args)`` with
        `model_args.device_map` set to ``"auto"``.

    Raises:
        ValueError: If no template is given, the vLLM backend is selected, or the
            model arguments fail `_verify_model_args`.
    """
    model_args, data_args, eval_args, finetuning_args = _parse_eval_args(args)

    _set_transformers_logging()

    if data_args.template is None:
        raise ValueError("Please specify which `template` to use.")

    if model_args.infer_backend == "vllm":
        raise ValueError("vLLM backend is only available for API, CLI and Web.")

    _verify_model_args(model_args, data_args, finetuning_args)
    _check_extra_dependencies(model_args, finetuning_args)

    model_args.device_map = "auto"

    transformers.set_seed(eval_args.seed)

    return model_args, data_args, eval_args, finetuning_args