# Copyright 2025 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
from collections.abc import Generator
from copy import deepcopy
from subprocess import Popen, TimeoutExpired
from typing import TYPE_CHECKING, Any, Optional

from transformers.trainer import TRAINING_ARGS_NAME
from transformers.utils import is_torch_npu_available

from ..extras.constants import LLAMABOARD_CONFIG, MULTIMODAL_SUPPORTED_MODELS, PEFT_METHODS, TRAINING_STAGES
from ..extras.misc import is_accelerator_available, torch_gc, use_ray
from ..extras.packages import is_gradio_available
from .common import (
    DEFAULT_CACHE_DIR,
    DEFAULT_CONFIG_DIR,
    abort_process,
    calculate_pixels,
    gen_cmd,
    get_save_dir,
    load_args,
    load_config,
    load_eval_results,
    save_args,
    save_cmd,
)
from .control import get_trainer_info
from .locales import ALERTS, LOCALES


if is_gradio_available():
    import gradio as gr


if TYPE_CHECKING:
    from gradio.components import Component

    from .manager import Manager


class Runner:
    r"""A class to manage the running status of the trainers."""

    def __init__(self, manager: "Manager", demo_mode: bool = False) -> None:
        r"""Init a runner."""
        self.manager = manager
        self.demo_mode = demo_mode
        """ Resume """
        self.trainer: Optional[Popen] = None
        self.do_train = True
        self.running_data: Optional[dict[Component, Any]] = None
        """ State """
        self.aborted = False
        self.running = False

    def set_abort(self) -> None:
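        r"""Abort the current run by flagging it and terminating the trainer process."""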
        self.aborted = True
        if self.trainer is not None:
            abort_process(self.trainer.pid)

    def _initialize(self, data: dict["Component", Any], do_train: bool, from_preview: bool) -> str:
        r"""Validate the configuration."""
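        # helper that reads a component's current value from the submitted data by its element id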
        get = lambda elem_id: data[self.manager.get_elem_by_id(elem_id)]
        lang, model_name, model_path = get("top.lang"), get("top.model_name"), get("top.model_path")
        dataset = get("train.dataset") if do_train else get("eval.dataset")

        if self.running:
            return ALERTS["err_conflict"][lang]

        if not model_name:
            return ALERTS["err_no_model"][lang]

        if not model_path:
            return ALERTS["err_no_path"][lang]

        if not dataset:
            return ALERTS["err_no_dataset"][lang]

        if not from_preview and self.demo_mode:
            return ALERTS["err_demo"][lang]

        if do_train:
            if not get("train.output_dir"):
                return ALERTS["err_no_output_dir"][lang]

            try:
                json.loads(get("train.extra_args"))
            except json.JSONDecodeError:
                return ALERTS["err_json_schema"][lang]

            stage = TRAINING_STAGES[get("train.training_stage")]
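            # PPO additionally requires a reward model to be selected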
            if stage == "ppo" and not get("train.reward_model"):
                return ALERTS["err_no_reward_model"][lang]
        else:
            if not get("eval.output_dir"):
                return ALERTS["err_no_output_dir"][lang]

        if not from_preview and not is_accelerator_available():
            gr.Warning(ALERTS["warn_no_cuda"][lang])

        return ""

    def _finalize(self, lang: str, finish_info: str) -> str:
        r"""Clean the cached memory and reset the runner."""
        finish_info = ALERTS["info_aborted"][lang] if self.aborted else finish_info
        gr.Info(finish_info)
        self.trainer = None
        self.aborted = False
        self.running = False
        self.running_data = None
        torch_gc()
        return finish_info

    def _parse_train_args(self, data: dict["Component", Any]) -> dict[str, Any]:
        r"""Build and validate the training arguments."""
        get = lambda elem_id: data[self.manager.get_elem_by_id(elem_id)]
        model_name, finetuning_type = get("top.model_name"), get("top.finetuning_type")
        user_config = load_config()

        args = dict(
            stage=TRAINING_STAGES[get("train.training_stage")],
            do_train=True,
            model_name_or_path=get("top.model_path"),
            cache_dir=user_config.get("cache_dir", None),
            preprocessing_num_workers=16,
            finetuning_type=finetuning_type,
            template=get("top.template"),
            rope_scaling=get("top.rope_scaling") if get("top.rope_scaling") != "none" else None,
            flash_attn="fa2" if get("top.booster") == "flashattn2" else "auto",
            use_unsloth=(get("top.booster") == "unsloth"),
            enable_liger_kernel=(get("top.booster") == "liger_kernel"),
            dataset_dir=get("train.dataset_dir"),
            dataset=",".join(get("train.dataset")),
            cutoff_len=get("train.cutoff_len"),
            learning_rate=float(get("train.learning_rate")),
            num_train_epochs=float(get("train.num_train_epochs")),
            max_samples=int(get("train.max_samples")),
            per_device_train_batch_size=get("train.batch_size"),
            gradient_accumulation_steps=get("train.gradient_accumulation_steps"),
            lr_scheduler_type=get("train.lr_scheduler_type"),
            max_grad_norm=float(get("train.max_grad_norm")),
            logging_steps=get("train.logging_steps"),
            save_steps=get("train.save_steps"),
            warmup_steps=get("train.warmup_steps"),
            neftune_noise_alpha=get("train.neftune_alpha") or None,
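            # neat packing implies packing, so enable packing whenever neat packing is checked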
            packing=get("train.packing") or get("train.neat_packing"),
            neat_packing=get("train.neat_packing"),
            train_on_prompt=get("train.train_on_prompt"),
            mask_history=get("train.mask_history"),
            resize_vocab=get("train.resize_vocab"),
            use_llama_pro=get("train.use_llama_pro"),
            enable_thinking=get("train.enable_thinking"),
            report_to=get("train.report_to"),
            use_galore=get("train.use_galore"),
            use_apollo=get("train.use_apollo"),
            use_badam=get("train.use_badam"),
            use_swanlab=get("train.use_swanlab"),
            output_dir=get_save_dir(model_name, finetuning_type, get("train.output_dir")),
            fp16=(get("train.compute_type") == "fp16"),
            bf16=(get("train.compute_type") == "bf16"),
            pure_bf16=(get("train.compute_type") == "pure_bf16"),
            plot_loss=True,
            trust_remote_code=True,
            ddp_timeout=180000000,
            include_num_input_tokens_seen=True,
        )
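        # merge user-provided extra arguments (a JSON string already validated in _initialize)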
        args.update(json.loads(get("train.extra_args")))

        # checkpoints
        if get("top.checkpoint_path"):
            if finetuning_type in PEFT_METHODS:  # list
                args["adapter_name_or_path"] = ",".join(
                    [get_save_dir(model_name, finetuning_type, adapter) for adapter in get("top.checkpoint_path")]
                )
            else:  # str
                args["model_name_or_path"] = get_save_dir(model_name, finetuning_type, get("top.checkpoint_path"))

        # quantization
        if get("top.quantization_bit") != "none":
            args["quantization_bit"] = int(get("top.quantization_bit"))
            args["quantization_method"] = get("top.quantization_method")
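            # double quantization is disabled when running on an NPU device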
            args["double_quantization"] = not is_torch_npu_available()

        # freeze config
        if args["finetuning_type"] == "freeze":
            args["freeze_trainable_layers"] = get("train.freeze_trainable_layers")
            args["freeze_trainable_modules"] = get("train.freeze_trainable_modules")
            args["freeze_extra_modules"] = get("train.freeze_extra_modules") or None

        # lora config
        if args["finetuning_type"] == "lora":
            args["lora_rank"] = get("train.lora_rank")
            args["lora_alpha"] = get("train.lora_alpha")
            args["lora_dropout"] = get("train.lora_dropout")
            args["loraplus_lr_ratio"] = get("train.loraplus_lr_ratio") or None
            args["create_new_adapter"] = get("train.create_new_adapter")
            args["use_rslora"] = get("train.use_rslora")
            args["use_dora"] = get("train.use_dora")
            args["pissa_init"] = get("train.use_pissa")
            args["pissa_convert"] = get("train.use_pissa")
            args["lora_target"] = get("train.lora_target") or "all"
            args["additional_target"] = get("train.additional_target") or None

            if args["use_llama_pro"]:
                args["freeze_trainable_layers"] = get("train.freeze_trainable_layers")

        # rlhf config
        if args["stage"] == "ppo":
            if finetuning_type in PEFT_METHODS:
                args["reward_model"] = ",".join(
                    [get_save_dir(model_name, finetuning_type, adapter) for adapter in get("train.reward_model")]
                )
            else:
                args["reward_model"] = get_save_dir(model_name, finetuning_type, get("train.reward_model"))

            args["reward_model_type"] = "lora" if finetuning_type == "lora" else "full"
            args["ppo_score_norm"] = get("train.ppo_score_norm")
            args["ppo_whiten_rewards"] = get("train.ppo_whiten_rewards")
            args["top_k"] = 0
            args["top_p"] = 0.9
        elif args["stage"] in ["dpo", "kto"]:
            args["pref_beta"] = get("train.pref_beta")
            args["pref_ftx"] = get("train.pref_ftx")
            args["pref_loss"] = get("train.pref_loss")

        # multimodal config
        if model_name in MULTIMODAL_SUPPORTED_MODELS:
            args["freeze_vision_tower"] = get("train.freeze_vision_tower")
            args["freeze_multi_modal_projector"] = get("train.freeze_multi_modal_projector")
            args["freeze_language_model"] = get("train.freeze_language_model")
            args["image_max_pixels"] = calculate_pixels(get("train.image_max_pixels"))
            args["image_min_pixels"] = calculate_pixels(get("train.image_min_pixels"))
            args["video_max_pixels"] = calculate_pixels(get("train.video_max_pixels"))
            args["video_min_pixels"] = calculate_pixels(get("train.video_min_pixels"))

        # galore config
        if args["use_galore"]:
            args["galore_rank"] = get("train.galore_rank")
            args["galore_update_interval"] = get("train.galore_update_interval")
            args["galore_scale"] = get("train.galore_scale")
            args["galore_target"] = get("train.galore_target")

        # apollo config
        if args["use_apollo"]:
            args["apollo_rank"] = get("train.apollo_rank")
            args["apollo_update_interval"] = get("train.apollo_update_interval")
            args["apollo_scale"] = get("train.apollo_scale")
            args["apollo_target"] = get("train.apollo_target")

        # badam config
        if args["use_badam"]:
            args["badam_mode"] = get("train.badam_mode")
            args["badam_switch_mode"] = get("train.badam_switch_mode")
            args["badam_switch_interval"] = get("train.badam_switch_interval")
            args["badam_update_ratio"] = get("train.badam_update_ratio")

        # swanlab config
        if get("train.use_swanlab"):
            args["swanlab_project"] = get("train.swanlab_project")
            args["swanlab_run_name"] = get("train.swanlab_run_name")
            args["swanlab_workspace"] = get("train.swanlab_workspace")
            args["swanlab_api_key"] = get("train.swanlab_api_key")
            args["swanlab_mode"] = get("train.swanlab_mode")

        # eval config
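        # hold out val_size of the training data and evaluate every save_steps steps (skipped for PPO)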
        if get("train.val_size") > 1e-6 and args["stage"] != "ppo":
            args["val_size"] = get("train.val_size")
            args["eval_strategy"] = "steps"
            args["eval_steps"] = args["save_steps"]
            args["per_device_eval_batch_size"] = args["per_device_train_batch_size"]

        # ds config
        if get("train.ds_stage") != "none":
            ds_stage = get("train.ds_stage")
            ds_offload = "offload_" if get("train.ds_offload") else ""
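            # resolves to a file such as "ds_z3_offload_config.json" under the cache directory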
            args["deepspeed"] = os.path.join(DEFAULT_CACHE_DIR, f"ds_z{ds_stage}_{ds_offload}config.json")

        return args

    def _parse_eval_args(self, data: dict["Component", Any]) -> dict[str, Any]:
        r"""Build and validate the evaluation arguments."""
        get = lambda elem_id: data[self.manager.get_elem_by_id(elem_id)]
        model_name, finetuning_type = get("top.model_name"), get("top.finetuning_type")
        user_config = load_config()

        args = dict(
            stage="sft",
            model_name_or_path=get("top.model_path"),
            cache_dir=user_config.get("cache_dir", None),
            preprocessing_num_workers=16,
            finetuning_type=finetuning_type,
            quantization_method=get("top.quantization_method"),
            template=get("top.template"),
            rope_scaling=get("top.rope_scaling") if get("top.rope_scaling") != "none" else None,
            flash_attn="fa2" if get("top.booster") == "flashattn2" else "auto",
            use_unsloth=(get("top.booster") == "unsloth"),
            dataset_dir=get("eval.dataset_dir"),
            eval_dataset=",".join(get("eval.dataset")),
            cutoff_len=get("eval.cutoff_len"),
            max_samples=int(get("eval.max_samples")),
            per_device_eval_batch_size=get("eval.batch_size"),
            predict_with_generate=True,
            max_new_tokens=get("eval.max_new_tokens"),
            top_p=get("eval.top_p"),
            temperature=get("eval.temperature"),
            output_dir=get_save_dir(model_name, finetuning_type, get("eval.output_dir")),
            trust_remote_code=True,
        )

        if get("eval.predict"):
            args["do_predict"] = True
        else:
            args["do_eval"] = True

        # checkpoints
        if get("top.checkpoint_path"):
            if finetuning_type in PEFT_METHODS:  # list
                args["adapter_name_or_path"] = ",".join(
                    [get_save_dir(model_name, finetuning_type, adapter) for adapter in get("top.checkpoint_path")]
                )
            else:  # str
                args["model_name_or_path"] = get_save_dir(model_name, finetuning_type, get("top.checkpoint_path"))

        # quantization
        if get("top.quantization_bit") != "none":
            args["quantization_bit"] = int(get("top.quantization_bit"))
            args["quantization_method"] = get("top.quantization_method")
            args["double_quantization"] = not is_torch_npu_available()

        return args

    def _preview(self, data: dict["Component", Any], do_train: bool) -> Generator[dict["Component", str], None, None]:
        r"""Preview the training commands."""
        output_box = self.manager.get_elem_by_id("{}.output_box".format("train" if do_train else "eval"))
        error = self._initialize(data, do_train, from_preview=True)
        if error:
            gr.Warning(error)
            yield {output_box: error}
        else:
            args = self._parse_train_args(data) if do_train else self._parse_eval_args(data)
            yield {output_box: gen_cmd(args)}

    def _launch(self, data: dict["Component", Any], do_train: bool) -> Generator[dict["Component", Any], None, None]:
        r"""Start the training process."""
        output_box = self.manager.get_elem_by_id("{}.output_box".format("train" if do_train else "eval"))
        error = self._initialize(data, do_train, from_preview=False)
        if error:
            gr.Warning(error)
            yield {output_box: error}
        else:
            self.do_train, self.running_data = do_train, data
            args = self._parse_train_args(data) if do_train else self._parse_eval_args(data)

            os.makedirs(args["output_dir"], exist_ok=True)
            save_args(os.path.join(args["output_dir"], LLAMABOARD_CONFIG), self._build_config_dict(data))

            env = deepcopy(os.environ)
            env["LLAMABOARD_ENABLED"] = "1"
            env["LLAMABOARD_WORKDIR"] = args["output_dir"]
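            # DeepSpeed requires a distributed launcher, so force the CLI to launch with torchrun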
            if args.get("deepspeed", None) is not None:
                env["FORCE_TORCHRUN"] = "1"

            # NOTE: DO NOT USE shell=True to avoid security risk
            self.trainer = Popen(["llamafactory-cli", "train", save_cmd(args)], env=env)
            yield from self.monitor()

    def _build_config_dict(self, data: dict["Component", Any]) -> dict[str, Any]:
        r"""Build a dictionary containing the current training configuration."""
        config_dict = {}
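        # exclude environment-specific fields from the saved configuration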
        skip_ids = ["top.lang", "top.model_path", "train.output_dir", "train.config_path"]
        for elem, value in data.items():
            elem_id = self.manager.get_id_by_elem(elem)
            if elem_id not in skip_ids:
                config_dict[elem_id] = value

        return config_dict

    def preview_train(self, data):
        yield from self._preview(data, do_train=True)

    def preview_eval(self, data):
        yield from self._preview(data, do_train=False)

    def run_train(self, data):
        yield from self._launch(data, do_train=True)

    def run_eval(self, data):
        yield from self._launch(data, do_train=False)

    def monitor(self):
        r"""Monitor the training progress and logs."""
        self.aborted = False
        self.running = True

        get = lambda elem_id: self.running_data[self.manager.get_elem_by_id(elem_id)]
        lang, model_name, finetuning_type = get("top.lang"), get("top.model_name"), get("top.finetuning_type")
        output_dir = get("{}.output_dir".format("train" if self.do_train else "eval"))
        output_path = get_save_dir(model_name, finetuning_type, output_dir)

        output_box = self.manager.get_elem_by_id("{}.output_box".format("train" if self.do_train else "eval"))
        progress_bar = self.manager.get_elem_by_id("{}.progress_bar".format("train" if self.do_train else "eval"))
        loss_viewer = self.manager.get_elem_by_id("train.loss_viewer") if self.do_train else None
        swanlab_link = self.manager.get_elem_by_id("train.swanlab_link") if self.do_train else None

        running_log = ""
        while self.trainer is not None:
            if self.aborted:
                yield {
                    output_box: ALERTS["info_aborting"][lang],
                    progress_bar: gr.Slider(visible=False),
                }
            else:
                running_log, running_progress, running_info = get_trainer_info(lang, output_path, self.do_train)
                return_dict = {
                    output_box: running_log,
                    progress_bar: running_progress,
                }
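                # attach the optional loss plot and SwanLab link only when the trainer reports them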
                if "loss_viewer" in running_info:
                    return_dict[loss_viewer] = running_info["loss_viewer"]

                if "swanlab_link" in running_info:
                    return_dict[swanlab_link] = running_info["swanlab_link"]

                yield return_dict
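            # poll the subprocess for up to 2 seconds; keep looping until it exits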
            try:
                self.trainer.wait(2)
                self.trainer = None
            except TimeoutExpired:
                continue

        if self.do_train:
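            # the run is considered successful if the trainer saved its arguments (or Ray manages the job)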
            if os.path.exists(os.path.join(output_path, TRAINING_ARGS_NAME)) or use_ray():
                finish_info = ALERTS["info_finished"][lang]
            else:
                finish_info = ALERTS["err_failed"][lang]
        else:
            if os.path.exists(os.path.join(output_path, "all_results.json")) or use_ray():
                finish_info = load_eval_results(os.path.join(output_path, "all_results.json"))
            else:
                finish_info = ALERTS["err_failed"][lang]

        return_dict = {
            output_box: self._finalize(lang, finish_info) + "\n\n" + running_log,
            progress_bar: gr.Slider(visible=False),
        }
        yield return_dict

    def save_args(self, data):
        r"""Save the training configuration to the config path."""
        output_box = self.manager.get_elem_by_id("train.output_box")
        error = self._initialize(data, do_train=True, from_preview=True)
        if error:
            gr.Warning(error)
            return {output_box: error}

        lang = data[self.manager.get_elem_by_id("top.lang")]
        config_path = data[self.manager.get_elem_by_id("train.config_path")]
        os.makedirs(DEFAULT_CONFIG_DIR, exist_ok=True)
        save_path = os.path.join(DEFAULT_CONFIG_DIR, config_path)

        save_args(save_path, self._build_config_dict(data))
        return {output_box: ALERTS["info_config_saved"][lang] + save_path}

    def load_args(self, lang: str, config_path: str):
        r"""Load the training configuration from the config path."""
        output_box = self.manager.get_elem_by_id("train.output_box")
        config_dict = load_args(os.path.join(DEFAULT_CONFIG_DIR, config_path))
        if config_dict is None:
            gr.Warning(ALERTS["err_config_not_found"][lang])
            return {output_box: ALERTS["err_config_not_found"][lang]}

        output_dict: dict[Component, Any] = {output_box: ALERTS["info_config_loaded"][lang]}
        for elem_id, value in config_dict.items():
            output_dict[self.manager.get_elem_by_id(elem_id)] = value

        return output_dict

    def check_output_dir(self, lang: str, model_name: str, finetuning_type: str, output_dir: str):
        r"""Restore the training status if output_dir exists."""
        output_box = self.manager.get_elem_by_id("train.output_box")
        output_dict: dict[Component, Any] = {output_box: LOCALES["output_box"][lang]["value"]}
        if model_name and output_dir and os.path.isdir(get_save_dir(model_name, finetuning_type, output_dir)):
            gr.Warning(ALERTS["warn_output_dir_exists"][lang])
            output_dict[output_box] = ALERTS["warn_output_dir_exists"][lang]

            output_dir = get_save_dir(model_name, finetuning_type, output_dir)
            config_dict = load_args(os.path.join(output_dir, LLAMABOARD_CONFIG))  # load llamaboard config
            for elem_id, value in config_dict.items():
                output_dict[self.manager.get_elem_by_id(elem_id)] = value

        return output_dict