runner.py 21.4 KB
Newer Older
chenych's avatar
chenych committed
1
# Copyright 2025 the LlamaFactory team.
chenych's avatar
chenych committed
2
3
4
5
6
7
8
9
10
11
12
13
14
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

luopl's avatar
luopl committed
15
import json
Rayyyyy's avatar
V0.6.3  
Rayyyyy committed
16
import os
chenych's avatar
chenych committed
17
from collections.abc import Generator
chenych's avatar
chenych committed
18
19
from copy import deepcopy
from subprocess import Popen, TimeoutExpired
chenych's avatar
chenych committed
20
from typing import TYPE_CHECKING, Any, Optional
Rayyyyy's avatar
V0.6.3  
Rayyyyy committed
21
22

from transformers.trainer import TRAINING_ARGS_NAME
luopl's avatar
luopl committed
23
from transformers.utils import is_torch_npu_available
Rayyyyy's avatar
V0.6.3  
Rayyyyy committed
24

chenych's avatar
chenych committed
25
from ..extras.constants import LLAMABOARD_CONFIG, PEFT_METHODS, TRAINING_STAGES
luopl's avatar
luopl committed
26
from ..extras.misc import is_gpu_or_npu_available, torch_gc, use_ray
chenych's avatar
chenych committed
27
28
29
30
31
32
33
34
35
36
37
38
39
40
from ..extras.packages import is_gradio_available
from .common import (
    DEFAULT_CACHE_DIR,
    DEFAULT_CONFIG_DIR,
    abort_process,
    gen_cmd,
    get_save_dir,
    load_args,
    load_config,
    load_eval_results,
    save_args,
    save_cmd,
)
from .control import get_trainer_info
chenych's avatar
chenych committed
41
from .locales import ALERTS, LOCALES
Rayyyyy's avatar
V0.6.3  
Rayyyyy committed
42
43
44
45
46
47
48
49
50
51
52
53
54


if is_gradio_available():
    import gradio as gr


if TYPE_CHECKING:
    from gradio.components import Component

    from .manager import Manager


class Runner:
chenych's avatar
chenych committed
55
    r"""A class to manage the running status of the trainers."""
chenych's avatar
chenych committed
56

Rayyyyy's avatar
V0.6.3  
Rayyyyy committed
57
    def __init__(self, manager: "Manager", demo_mode: bool = False) -> None:
chenych's avatar
chenych committed
58
        r"""Init a runner."""
Rayyyyy's avatar
V0.6.3  
Rayyyyy committed
59
60
61
        self.manager = manager
        self.demo_mode = demo_mode
        """ Resume """
chenych's avatar
chenych committed
62
        self.trainer: Optional[Popen] = None
Rayyyyy's avatar
V0.6.3  
Rayyyyy committed
63
        self.do_train = True
chenych's avatar
chenych committed
64
        self.running_data: dict[Component, Any] = None
Rayyyyy's avatar
V0.6.3  
Rayyyyy committed
65
66
67
68
69
70
        """ State """
        self.aborted = False
        self.running = False

    def set_abort(self) -> None:
        self.aborted = True
chenych's avatar
chenych committed
71
72
        if self.trainer is not None:
            abort_process(self.trainer.pid)
Rayyyyy's avatar
V0.6.3  
Rayyyyy committed
73

chenych's avatar
chenych committed
74
75
    def _initialize(self, data: dict["Component", Any], do_train: bool, from_preview: bool) -> str:
        r"""Validate the configuration.

        Args:
            data: Mapping from UI components to their current values.
            do_train: Validate the training tab when true, the evaluation tab otherwise.
            from_preview: True when nothing is going to be launched; the demo-mode
                check and the missing-device warning are skipped in that case.

        Returns:
            A localized error message, or an empty string when the configuration is valid.
        """
        get = lambda elem_id: data[self.manager.get_elem_by_id(elem_id)]
        lang, model_name, model_path = get("top.lang"), get("top.model_name"), get("top.model_path")
        dataset = get("train.dataset") if do_train else get("eval.dataset")

        if self.running:
            return ALERTS["err_conflict"][lang]

        if not model_name:
            return ALERTS["err_no_model"][lang]

        if not model_path:
            return ALERTS["err_no_path"][lang]

        if not dataset:
            return ALERTS["err_no_dataset"][lang]

        if not from_preview and self.demo_mode:
            return ALERTS["err_demo"][lang]

        if do_train:
            if not get("train.output_dir"):
                return ALERTS["err_no_output_dir"][lang]

            try:
                extra_args = json.loads(get("train.extra_args"))
            except json.JSONDecodeError:
                return ALERTS["err_json_schema"][lang]

            # Being valid JSON is not enough: the extra arguments are merged into the
            # training arguments with `dict.update`, so they must be a JSON object.
            if not isinstance(extra_args, dict):
                return ALERTS["err_json_schema"][lang]

            stage = TRAINING_STAGES[get("train.training_stage")]
            if stage == "ppo" and not get("train.reward_model"):
                return ALERTS["err_no_reward_model"][lang]
        else:
            if not get("eval.output_dir"):
                return ALERTS["err_no_output_dir"][lang]

        # Only a warning: the run is still allowed to start without a GPU/NPU.
        if not from_preview and not is_gpu_or_npu_available():
            gr.Warning(ALERTS["warn_no_cuda"][lang])

        return ""

    def _finalize(self, lang: str, finish_info: str) -> str:
        r"""Clean the cached memory and reset the runner.

        Args:
            lang: Language key used to pick the localized alert.
            finish_info: Message to report when the run was not aborted.

        Returns:
            The message that was shown: the abort notice if the run was aborted,
            otherwise ``finish_info``.
        """
        if self.aborted:
            finish_info = ALERTS["info_aborted"][lang]

        gr.Info(finish_info)

        # Back to the idle state so that a new run can be launched.
        self.trainer = None
        self.aborted = False
        self.running = False
        self.running_data = None
        torch_gc()
        return finish_info

chenych's avatar
chenych committed
127
128
    def _parse_train_args(self, data: dict["Component", Any]) -> dict[str, Any]:
        r"""Build and validate the training arguments.

        Reads the values of the ``top.*`` and ``train.*`` UI elements from ``data``
        (looked up through the manager) and assembles them into a flat dictionary.

        Args:
            data: Mapping from UI components to their current values.

        Returns:
            The training arguments. Keys that only matter for one mode (freeze, LoRA,
            PPO, DPO/KTO, GaLore, APOLLO, BAdam, SwanLab, evaluation, DeepSpeed) are
            added only when that mode is selected.
        """
        get = lambda elem_id: data[self.manager.get_elem_by_id(elem_id)]
        model_name, finetuning_type = get("top.model_name"), get("top.finetuning_type")
        user_config = load_config()

        args = dict(
            stage=TRAINING_STAGES[get("train.training_stage")],
            do_train=True,
            model_name_or_path=get("top.model_path"),
            cache_dir=user_config.get("cache_dir", None),
            preprocessing_num_workers=16,
            finetuning_type=finetuning_type,
            template=get("top.template"),
            rope_scaling=get("top.rope_scaling") if get("top.rope_scaling") != "none" else None,
            flash_attn="fa2" if get("top.booster") == "flashattn2" else "auto",
            use_unsloth=(get("top.booster") == "unsloth"),
            enable_liger_kernel=(get("top.booster") == "liger_kernel"),
            dataset_dir=get("train.dataset_dir"),
            dataset=",".join(get("train.dataset")),
            cutoff_len=get("train.cutoff_len"),
            learning_rate=float(get("train.learning_rate")),
            num_train_epochs=float(get("train.num_train_epochs")),
            max_samples=int(get("train.max_samples")),
            per_device_train_batch_size=get("train.batch_size"),
            gradient_accumulation_steps=get("train.gradient_accumulation_steps"),
            lr_scheduler_type=get("train.lr_scheduler_type"),
            max_grad_norm=float(get("train.max_grad_norm")),
            logging_steps=get("train.logging_steps"),
            save_steps=get("train.save_steps"),
            warmup_steps=get("train.warmup_steps"),
            neftune_noise_alpha=get("train.neftune_alpha") or None,
            # enabling neat packing also enables packing
            packing=get("train.packing") or get("train.neat_packing"),
            neat_packing=get("train.neat_packing"),
            train_on_prompt=get("train.train_on_prompt"),
            mask_history=get("train.mask_history"),
            resize_vocab=get("train.resize_vocab"),
            use_llama_pro=get("train.use_llama_pro"),
            report_to=get("train.report_to"),
            use_galore=get("train.use_galore"),
            use_apollo=get("train.use_apollo"),
            use_badam=get("train.use_badam"),
            use_swanlab=get("train.use_swanlab"),
            output_dir=get_save_dir(model_name, finetuning_type, get("train.output_dir")),
            fp16=(get("train.compute_type") == "fp16"),
            bf16=(get("train.compute_type") == "bf16"),
            pure_bf16=(get("train.compute_type") == "pure_bf16"),
            plot_loss=True,
            trust_remote_code=True,
            ddp_timeout=180000000,
            include_num_input_tokens_seen=True,
        )
        # User-supplied JSON overrides the values collected above. The sections below
        # that read from `args` (rather than `get`) therefore see the merged values.
        args.update(json.loads(get("train.extra_args")))

        # checkpoints
        if get("top.checkpoint_path"):
            if finetuning_type in PEFT_METHODS:  # list
                args["adapter_name_or_path"] = ",".join(
                    [get_save_dir(model_name, finetuning_type, adapter) for adapter in get("top.checkpoint_path")]
                )
            else:  # str
                args["model_name_or_path"] = get_save_dir(model_name, finetuning_type, get("top.checkpoint_path"))

        # quantization
        if get("top.quantization_bit") != "none":
            args["quantization_bit"] = int(get("top.quantization_bit"))
            args["quantization_method"] = get("top.quantization_method")
            # double quantization is switched off when a torch NPU is available
            args["double_quantization"] = not is_torch_npu_available()

        # freeze config
        if args["finetuning_type"] == "freeze":
            args["freeze_trainable_layers"] = get("train.freeze_trainable_layers")
            args["freeze_trainable_modules"] = get("train.freeze_trainable_modules")
            args["freeze_extra_modules"] = get("train.freeze_extra_modules") or None

        # lora config
        if args["finetuning_type"] == "lora":
            args["lora_rank"] = get("train.lora_rank")
            args["lora_alpha"] = get("train.lora_alpha")
            args["lora_dropout"] = get("train.lora_dropout")
            args["loraplus_lr_ratio"] = get("train.loraplus_lr_ratio") or None
            args["create_new_adapter"] = get("train.create_new_adapter")
            args["use_rslora"] = get("train.use_rslora")
            args["use_dora"] = get("train.use_dora")
            # both PiSSA options follow the single `train.use_pissa` switch
            args["pissa_init"] = get("train.use_pissa")
            args["pissa_convert"] = get("train.use_pissa")
            args["lora_target"] = get("train.lora_target") or "all"
            args["additional_target"] = get("train.additional_target") or None

            if args["use_llama_pro"]:
                args["freeze_trainable_layers"] = get("train.freeze_trainable_layers")

        # rlhf config
        if args["stage"] == "ppo":
            # same list-vs-str distinction as for the checkpoint path above
            if finetuning_type in PEFT_METHODS:
                args["reward_model"] = ",".join(
                    [get_save_dir(model_name, finetuning_type, adapter) for adapter in get("train.reward_model")]
                )
            else:
                args["reward_model"] = get_save_dir(model_name, finetuning_type, get("train.reward_model"))

            args["reward_model_type"] = "lora" if finetuning_type == "lora" else "full"
            args["ppo_score_norm"] = get("train.ppo_score_norm")
            args["ppo_whiten_rewards"] = get("train.ppo_whiten_rewards")
            args["top_k"] = 0
            args["top_p"] = 0.9
        elif args["stage"] in ["dpo", "kto"]:
            args["pref_beta"] = get("train.pref_beta")
            args["pref_ftx"] = get("train.pref_ftx")
            args["pref_loss"] = get("train.pref_loss")

        # galore config
        if args["use_galore"]:
            args["galore_rank"] = get("train.galore_rank")
            args["galore_update_interval"] = get("train.galore_update_interval")
            args["galore_scale"] = get("train.galore_scale")
            args["galore_target"] = get("train.galore_target")

        # apollo config
        if args["use_apollo"]:
            args["apollo_rank"] = get("train.apollo_rank")
            args["apollo_update_interval"] = get("train.apollo_update_interval")
            args["apollo_scale"] = get("train.apollo_scale")
            args["apollo_target"] = get("train.apollo_target")

        # badam config
        if args["use_badam"]:
            args["badam_mode"] = get("train.badam_mode")
            args["badam_switch_mode"] = get("train.badam_switch_mode")
            args["badam_switch_interval"] = get("train.badam_switch_interval")
            args["badam_update_ratio"] = get("train.badam_update_ratio")

        # report_to
        # "none" takes precedence over "all"; either one replaces the whole selection
        if "none" in args["report_to"]:
            args["report_to"] = "none"
        elif "all" in args["report_to"]:
            args["report_to"] = "all"

        # swanlab config
        if get("train.use_swanlab"):
            args["swanlab_project"] = get("train.swanlab_project")
            args["swanlab_run_name"] = get("train.swanlab_run_name")
            args["swanlab_workspace"] = get("train.swanlab_workspace")
            args["swanlab_api_key"] = get("train.swanlab_api_key")
            args["swanlab_mode"] = get("train.swanlab_mode")

        # eval config
        # not applied to PPO; evaluation reuses the save interval and the train batch size
        if get("train.val_size") > 1e-6 and args["stage"] != "ppo":
            args["val_size"] = get("train.val_size")
            args["eval_strategy"] = "steps"
            args["eval_steps"] = args["save_steps"]
            args["per_device_eval_batch_size"] = args["per_device_train_batch_size"]

        # ds config
        if get("train.ds_stage") != "none":
            ds_stage = get("train.ds_stage")
            ds_offload = "offload_" if get("train.ds_offload") else ""
            # file name: ds_z<stage>_[offload_]config.json inside DEFAULT_CACHE_DIR
            args["deepspeed"] = os.path.join(DEFAULT_CACHE_DIR, f"ds_z{ds_stage}_{ds_offload}config.json")

        return args

chenych's avatar
chenych committed
288
289
    def _parse_eval_args(self, data: dict["Component", Any]) -> dict[str, Any]:
        r"""Build and validate the evaluation arguments.

        Args:
            data: Mapping from UI components to their current values.

        Returns:
            The arguments for an evaluation or prediction run, assembled from the
            ``top.*`` and ``eval.*`` UI elements.
        """
        get = lambda elem_id: data[self.manager.get_elem_by_id(elem_id)]
        model_name = get("top.model_name")
        finetuning_type = get("top.finetuning_type")
        user_config = load_config()
        rope_scaling = get("top.rope_scaling")
        booster = get("top.booster")

        args = {
            "stage": "sft",
            "model_name_or_path": get("top.model_path"),
            "cache_dir": user_config.get("cache_dir", None),
            "preprocessing_num_workers": 16,
            "finetuning_type": finetuning_type,
            "quantization_method": get("top.quantization_method"),
            "template": get("top.template"),
            "rope_scaling": None if rope_scaling == "none" else rope_scaling,
            "flash_attn": "fa2" if booster == "flashattn2" else "auto",
            "use_unsloth": booster == "unsloth",
            "dataset_dir": get("eval.dataset_dir"),
            "eval_dataset": ",".join(get("eval.dataset")),
            "cutoff_len": get("eval.cutoff_len"),
            "max_samples": int(get("eval.max_samples")),
            "per_device_eval_batch_size": get("eval.batch_size"),
            "predict_with_generate": True,
            "max_new_tokens": get("eval.max_new_tokens"),
            "top_p": get("eval.top_p"),
            "temperature": get("eval.temperature"),
            "output_dir": get_save_dir(model_name, finetuning_type, get("eval.output_dir")),
            "trust_remote_code": True,
        }

        # Exactly one of the two run modes is switched on.
        args["do_predict" if get("eval.predict") else "do_eval"] = True

        # checkpoints
        checkpoint_path = get("top.checkpoint_path")
        if checkpoint_path:
            if finetuning_type in PEFT_METHODS:  # a list of adapters
                adapter_dirs = [get_save_dir(model_name, finetuning_type, adapter) for adapter in checkpoint_path]
                args["adapter_name_or_path"] = ",".join(adapter_dirs)
            else:  # a single checkpoint name
                args["model_name_or_path"] = get_save_dir(model_name, finetuning_type, checkpoint_path)

        # quantization
        quantization_bit = get("top.quantization_bit")
        if quantization_bit != "none":
            args.update(
                quantization_bit=int(quantization_bit),
                quantization_method=get("top.quantization_method"),
                double_quantization=not is_torch_npu_available(),
            )

        return args

chenych's avatar
chenych committed
340
341
    def _preview(self, data: dict["Component", Any], do_train: bool) -> Generator[dict["Component", str], None, None]:
        r"""Preview the training commands.

        Yields a single update for the output box of the selected tab: either the
        validation error or the command generated from the parsed arguments.
        """
        tab = "train" if do_train else "eval"
        output_box = self.manager.get_elem_by_id("{}.output_box".format(tab))

        error = self._initialize(data, do_train, from_preview=True)
        if error:
            gr.Warning(error)
            yield {output_box: error}
            return

        if do_train:
            args = self._parse_train_args(data)
        else:
            args = self._parse_eval_args(data)

        yield {output_box: gen_cmd(args)}

chenych's avatar
chenych committed
351
352
    def _launch(self, data: dict["Component", Any], do_train: bool) -> Generator[dict["Component", Any], None, None]:
        r"""Start the training process.

        Validates the configuration, writes the llamaboard config into the output
        directory, spawns ``llamafactory-cli train`` as a child process and then
        hands over to :meth:`monitor`.

        Args:
            data: Mapping from UI components to their current values.
            do_train: Launch a training run when true, an evaluation run otherwise.

        Yields:
            UI updates: the validation error, or whatever :meth:`monitor` yields.
        """
        output_box = self.manager.get_elem_by_id("{}.output_box".format("train" if do_train else "eval"))
        error = self._initialize(data, do_train, from_preview=False)
        if error:
            gr.Warning(error)
            yield {output_box: error}
        else:
            # Remembered so that `monitor` can read the settings of the launched run.
            self.do_train, self.running_data = do_train, data
            args = self._parse_train_args(data) if do_train else self._parse_eval_args(data)

            os.makedirs(args["output_dir"], exist_ok=True)
            save_args(os.path.join(args["output_dir"], LLAMABOARD_CONFIG), self._build_config_dict(data))

            # `os.environ.copy()` returns a plain dict. Deep-copying `os.environ` would
            # give another `os._Environ`, whose item assignment still calls `putenv`
            # and would leak the variables below into this process's own environment.
            env = os.environ.copy()
            env["LLAMABOARD_ENABLED"] = "1"
            env["LLAMABOARD_WORKDIR"] = args["output_dir"]
            if args.get("deepspeed", None) is not None:
                env["FORCE_TORCHRUN"] = "1"

            self.trainer = Popen(["llamafactory-cli", "train", save_cmd(args)], env=env)
            yield from self.monitor()

chenych's avatar
chenych committed
374
375
    def _build_config_dict(self, data: dict["Component", Any]) -> dict[str, Any]:
        r"""Build a dictionary containing the current training configuration."""
chenych's avatar
chenych committed
376
377
378
379
380
381
382
383
384
        config_dict = {}
        skip_ids = ["top.lang", "top.model_path", "train.output_dir", "train.config_path"]
        for elem, value in data.items():
            elem_id = self.manager.get_id_by_elem(elem)
            if elem_id not in skip_ids:
                config_dict[elem_id] = value

        return config_dict

Rayyyyy's avatar
V0.6.3  
Rayyyyy committed
385
386
387
388
389
390
391
392
393
394
395
396
397
    def preview_train(self, data):
        yield from self._preview(data, do_train=True)

    def preview_eval(self, data):
        yield from self._preview(data, do_train=False)

    def run_train(self, data):
        yield from self._launch(data, do_train=True)

    def run_eval(self, data):
        yield from self._launch(data, do_train=False)

    def monitor(self):
        r"""Monitor the training progress and logs.

        Polls the trainer process until it exits, yielding UI updates on every
        iteration, then yields one final update with the outcome of the run.
        """
        self.aborted = False
        self.running = True

        tab = "train" if self.do_train else "eval"
        get = lambda elem_id: self.running_data[self.manager.get_elem_by_id(elem_id)]
        lang = get("top.lang")
        model_name = get("top.model_name")
        finetuning_type = get("top.finetuning_type")
        output_path = get_save_dir(model_name, finetuning_type, get("{}.output_dir".format(tab)))

        output_box = self.manager.get_elem_by_id("{}.output_box".format(tab))
        progress_bar = self.manager.get_elem_by_id("{}.progress_bar".format(tab))
        # These two components are only looked up for a training run.
        loss_viewer, swanlab_link = None, None
        if self.do_train:
            loss_viewer = self.manager.get_elem_by_id("train.loss_viewer")
            swanlab_link = self.manager.get_elem_by_id("train.swanlab_link")

        running_log = ""
        while self.trainer is not None:
            if self.aborted:
                yield {
                    output_box: ALERTS["info_aborting"][lang],
                    progress_bar: gr.Slider(visible=False),
                }
            else:
                running_log, running_progress, running_info = get_trainer_info(lang, output_path, self.do_train)
                return_dict = {output_box: running_log, progress_bar: running_progress}
                for info_key, component in (("loss_viewer", loss_viewer), ("swanlab_link", swanlab_link)):
                    if info_key in running_info:
                        return_dict[component] = running_info[info_key]

                yield return_dict

            # Wait up to two seconds for the process to exit; keep polling on timeout.
            try:
                self.trainer.wait(2)
            except TimeoutExpired:
                continue
            else:
                self.trainer = None

        if self.do_train:
            succeeded = os.path.exists(os.path.join(output_path, TRAINING_ARGS_NAME)) or use_ray()
            finish_info = ALERTS["info_finished"][lang] if succeeded else ALERTS["err_failed"][lang]
        else:
            results_path = os.path.join(output_path, "all_results.json")
            if os.path.exists(results_path) or use_ray():
                finish_info = load_eval_results(results_path)
            else:
                finish_info = ALERTS["err_failed"][lang]

        yield {
            output_box: self._finalize(lang, finish_info) + "\n\n" + running_log,
            progress_bar: gr.Slider(visible=False),
        }

    def save_args(self, data):
        r"""Save the training configuration to config path.

        Returns an update for the training output box: the validation error, or a
        confirmation followed by the path the configuration was written to.
        """
        output_box = self.manager.get_elem_by_id("train.output_box")
        error = self._initialize(data, do_train=True, from_preview=True)
        if error:
            gr.Warning(error)
            return {output_box: error}

        get = lambda elem_id: data[self.manager.get_elem_by_id(elem_id)]
        lang, config_path = get("top.lang"), get("train.config_path")

        os.makedirs(DEFAULT_CONFIG_DIR, exist_ok=True)
        save_path = os.path.join(DEFAULT_CONFIG_DIR, config_path)
        save_args(save_path, self._build_config_dict(data))

        message = ALERTS["info_config_saved"][lang] + save_path
        return {output_box: message}

    def load_args(self, lang: str, config_path: str):
        r"""Load the training configuration from config path.

        Returns a mapping from components to their new values; when the
        configuration cannot be loaded it only carries the error for the output box.
        """
        output_box = self.manager.get_elem_by_id("train.output_box")
        config_dict = load_args(os.path.join(DEFAULT_CONFIG_DIR, config_path))
        if config_dict is None:
            not_found = ALERTS["err_config_not_found"][lang]
            gr.Warning(not_found)
            return {output_box: not_found}

        output_dict: dict["Component", Any] = {output_box: ALERTS["info_config_loaded"][lang]}
        output_dict.update(
            {self.manager.get_elem_by_id(elem_id): value for elem_id, value in config_dict.items()}
        )
        return output_dict
chenych's avatar
chenych committed
484
485

    def check_output_dir(self, lang: str, model_name: str, finetuning_type: str, output_dir: str):
        r"""Restore the training status if output_dir exists.

        Args:
            lang: Language key used to pick the localized messages.
            model_name: Name of the selected model.
            finetuning_type: Selected finetuning type.
            output_dir: Output directory name entered in the UI.

        Returns:
            A mapping from components to their new values. It always contains the
            output box; when the save directory exists it also contains every
            element stored in that directory's llamaboard config, if one can be loaded.
        """
        output_box = self.manager.get_elem_by_id("train.output_box")
        output_dict: dict["Component", Any] = {output_box: LOCALES["output_box"][lang]["value"]}
        if not (model_name and output_dir):
            return output_dict

        save_dir = get_save_dir(model_name, finetuning_type, output_dir)
        if not os.path.isdir(save_dir):
            return output_dict

        gr.Warning(ALERTS["warn_output_dir_exists"][lang])
        output_dict[output_box] = ALERTS["warn_output_dir_exists"][lang]

        config_dict = load_args(os.path.join(save_dir, LLAMABOARD_CONFIG))  # load llamaboard config
        # The directory may exist without a loadable llamaboard config, in which case
        # `load_args` returns None; keep the warning and restore nothing.
        if config_dict is not None:
            for elem_id, value in config_dict.items():
                output_dict[self.manager.get_elem_by_id(elem_id)] = value

        return output_dict