# Copyright 2025 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

luopl's avatar
luopl committed
15
import json
Rayyyyy's avatar
V0.6.3  
Rayyyyy committed
16
import os
chenych's avatar
chenych committed
17
from collections.abc import Generator
chenych's avatar
chenych committed
18
from copy import deepcopy
chenych's avatar
chenych committed
19
from subprocess import PIPE, Popen, TimeoutExpired
chenych's avatar
chenych committed
20
from typing import TYPE_CHECKING, Any, Optional
Rayyyyy's avatar
V0.6.3  
Rayyyyy committed
21

luopl's avatar
luopl committed
22
from transformers.utils import is_torch_npu_available
Rayyyyy's avatar
V0.6.3  
Rayyyyy committed
23

chenych's avatar
chenych committed
24
from ..extras.constants import LLAMABOARD_CONFIG, MULTIMODAL_SUPPORTED_MODELS, PEFT_METHODS, TRAINING_STAGES
chenych's avatar
chenych committed
25
from ..extras.misc import is_accelerator_available, torch_gc
chenych's avatar
chenych committed
26
27
28
29
30
from ..extras.packages import is_gradio_available
from .common import (
    DEFAULT_CACHE_DIR,
    DEFAULT_CONFIG_DIR,
    abort_process,
chenych's avatar
chenych committed
31
    calculate_pixels,
chenych's avatar
chenych committed
32
33
34
35
36
37
38
39
40
    gen_cmd,
    get_save_dir,
    load_args,
    load_config,
    load_eval_results,
    save_args,
    save_cmd,
)
from .control import get_trainer_info
chenych's avatar
chenych committed
41
from .locales import ALERTS, LOCALES
Rayyyyy's avatar
V0.6.3  
Rayyyyy committed
42
43
44
45
46
47
48
49
50
51
52
53
54


if is_gradio_available():
    import gradio as gr


if TYPE_CHECKING:
    from gradio.components import Component

    from .manager import Manager


class Runner:
chenych's avatar
chenych committed
55
    r"""A class to manage the running status of the trainers."""
chenych's avatar
chenych committed
56

Rayyyyy's avatar
V0.6.3  
Rayyyyy committed
57
    def __init__(self, manager: "Manager", demo_mode: bool = False) -> None:
chenych's avatar
chenych committed
58
        r"""Init a runner."""
Rayyyyy's avatar
V0.6.3  
Rayyyyy committed
59
60
61
        self.manager = manager
        self.demo_mode = demo_mode
        """ Resume """
chenych's avatar
chenych committed
62
        self.trainer: Optional[Popen] = None
Rayyyyy's avatar
V0.6.3  
Rayyyyy committed
63
        self.do_train = True
chenych's avatar
chenych committed
64
        self.running_data: dict[Component, Any] = None
Rayyyyy's avatar
V0.6.3  
Rayyyyy committed
65
66
67
68
69
70
        """ State """
        self.aborted = False
        self.running = False

    def set_abort(self) -> None:
        self.aborted = True
chenych's avatar
chenych committed
71
72
        if self.trainer is not None:
            abort_process(self.trainer.pid)
Rayyyyy's avatar
V0.6.3  
Rayyyyy committed
73

chenych's avatar
chenych committed
74
75
    def _initialize(self, data: dict["Component", Any], do_train: bool, from_preview: bool) -> str:
        r"""Validate the configuration."""
Rayyyyy's avatar
V0.6.3  
Rayyyyy committed
76
77
78
79
80
81
82
83
84
85
86
87
88
        get = lambda elem_id: data[self.manager.get_elem_by_id(elem_id)]
        lang, model_name, model_path = get("top.lang"), get("top.model_name"), get("top.model_path")
        dataset = get("train.dataset") if do_train else get("eval.dataset")

        if self.running:
            return ALERTS["err_conflict"][lang]

        if not model_name:
            return ALERTS["err_no_model"][lang]

        if not model_path:
            return ALERTS["err_no_path"][lang]

chenych's avatar
chenych committed
89
        if not dataset:
Rayyyyy's avatar
V0.6.3  
Rayyyyy committed
90
91
92
93
94
95
            return ALERTS["err_no_dataset"][lang]

        if not from_preview and self.demo_mode:
            return ALERTS["err_demo"][lang]

        if do_train:
chenych's avatar
chenych committed
96
97
98
            if not get("train.output_dir"):
                return ALERTS["err_no_output_dir"][lang]

luopl's avatar
luopl committed
99
100
101
102
103
            try:
                json.loads(get("train.extra_args"))
            except json.JSONDecodeError:
                return ALERTS["err_json_schema"][lang]

Rayyyyy's avatar
V0.6.3  
Rayyyyy committed
104
            stage = TRAINING_STAGES[get("train.training_stage")]
chenych's avatar
chenych committed
105
            if stage == "ppo" and not get("train.reward_model"):
Rayyyyy's avatar
V0.6.3  
Rayyyyy committed
106
                return ALERTS["err_no_reward_model"][lang]
chenych's avatar
chenych committed
107
108
109
        else:
            if not get("eval.output_dir"):
                return ALERTS["err_no_output_dir"][lang]
Rayyyyy's avatar
V0.6.3  
Rayyyyy committed
110

chenych's avatar
chenych committed
111
        if not from_preview and not is_accelerator_available():
Rayyyyy's avatar
V0.6.3  
Rayyyyy committed
112
113
114
115
            gr.Warning(ALERTS["warn_no_cuda"][lang])

        return ""

chenych's avatar
chenych committed
116
    def _finalize(self, lang: str, finish_info: str) -> None:
        r"""Show the closing notification, reset the runner and release cached memory.

        Args:
            lang: Language key used to look up the localized abort message.
            finish_info: Message to display unless the job was aborted.
        """
        if self.aborted:
            # an abort always wins over whatever the caller wanted to report
            finish_info = ALERTS["info_aborted"][lang]

        gr.Info(finish_info)

        # back to the idle state
        self.trainer = None
        self.aborted = self.running = False
        self.running_data = None
        torch_gc()

chenych's avatar
chenych committed
126
127
    def _parse_train_args(self, data: dict["Component", Any]) -> dict[str, Any]:
        r"""Convert the training tab's component values into trainer arguments.

        Args:
            data: Mapping from UI components to their current values.

        Returns:
            The argument dict. Entries parsed from ``train.extra_args`` are merged right
            after the base arguments, so the sections that follow read the merged values
            and may overwrite them.
        """

        def get(elem_id: str) -> Any:
            return data[self.manager.get_elem_by_id(elem_id)]

        model_name, finetuning_type = get("top.model_name"), get("top.finetuning_type")
        user_config = load_config()

        args = {
            "stage": TRAINING_STAGES[get("train.training_stage")],
            "do_train": True,
            "model_name_or_path": get("top.model_path"),
            "cache_dir": user_config.get("cache_dir", None),
            "preprocessing_num_workers": 16,
            "finetuning_type": finetuning_type,
            "template": get("top.template"),
            "rope_scaling": get("top.rope_scaling") if get("top.rope_scaling") != "none" else None,
            "flash_attn": "fa2" if get("top.booster") == "flashattn2" else "auto",
            "use_unsloth": (get("top.booster") == "unsloth"),
            "enable_liger_kernel": (get("top.booster") == "liger_kernel"),
            "dataset_dir": get("train.dataset_dir"),
            "dataset": ",".join(get("train.dataset")),
            "cutoff_len": get("train.cutoff_len"),
            "learning_rate": float(get("train.learning_rate")),
            "num_train_epochs": float(get("train.num_train_epochs")),
            "max_samples": int(get("train.max_samples")),
            "per_device_train_batch_size": get("train.batch_size"),
            "gradient_accumulation_steps": get("train.gradient_accumulation_steps"),
            "lr_scheduler_type": get("train.lr_scheduler_type"),
            "max_grad_norm": float(get("train.max_grad_norm")),
            "logging_steps": get("train.logging_steps"),
            "save_steps": get("train.save_steps"),
            "warmup_steps": get("train.warmup_steps"),
            "neftune_noise_alpha": get("train.neftune_alpha") or None,
            "packing": get("train.packing") or get("train.neat_packing"),
            "neat_packing": get("train.neat_packing"),
            "train_on_prompt": get("train.train_on_prompt"),
            "mask_history": get("train.mask_history"),
            "resize_vocab": get("train.resize_vocab"),
            "use_llama_pro": get("train.use_llama_pro"),
            "enable_thinking": get("train.enable_thinking"),
            "report_to": get("train.report_to"),
            "use_galore": get("train.use_galore"),
            "use_apollo": get("train.use_apollo"),
            "use_badam": get("train.use_badam"),
            "use_swanlab": get("train.use_swanlab"),
            "output_dir": get_save_dir(model_name, finetuning_type, get("train.output_dir")),
            "fp16": (get("train.compute_type") == "fp16"),
            "bf16": (get("train.compute_type") == "bf16"),
            "pure_bf16": (get("train.compute_type") == "pure_bf16"),
            "plot_loss": True,
            "trust_remote_code": True,
            "ddp_timeout": 180000000,
            "include_num_input_tokens_seen": True,
        }
        # user-supplied extras are merged before the sections below inspect `args`
        args.update(json.loads(get("train.extra_args")))

        # checkpoints: a list of adapters for PEFT methods, a single path otherwise
        checkpoint_path = get("top.checkpoint_path")
        if checkpoint_path:
            if finetuning_type in PEFT_METHODS:
                args["adapter_name_or_path"] = ",".join(
                    get_save_dir(model_name, finetuning_type, adapter) for adapter in checkpoint_path
                )
            else:
                args["model_name_or_path"] = get_save_dir(model_name, finetuning_type, checkpoint_path)

        # quantization
        quantization_bit = get("top.quantization_bit")
        if quantization_bit != "none":
            args.update(
                quantization_bit=int(quantization_bit),
                quantization_method=get("top.quantization_method"),
                double_quantization=not is_torch_npu_available(),
            )

        # freeze config
        if args["finetuning_type"] == "freeze":
            args.update(
                freeze_trainable_layers=get("train.freeze_trainable_layers"),
                freeze_trainable_modules=get("train.freeze_trainable_modules"),
                freeze_extra_modules=get("train.freeze_extra_modules") or None,
            )

        # lora config
        if args["finetuning_type"] == "lora":
            args.update(
                lora_rank=get("train.lora_rank"),
                lora_alpha=get("train.lora_alpha"),
                lora_dropout=get("train.lora_dropout"),
                loraplus_lr_ratio=get("train.loraplus_lr_ratio") or None,
                create_new_adapter=get("train.create_new_adapter"),
                use_rslora=get("train.use_rslora"),
                use_dora=get("train.use_dora"),
                pissa_init=get("train.use_pissa"),
                pissa_convert=get("train.use_pissa"),
                lora_target=get("train.lora_target") or "all",
                additional_target=get("train.additional_target") or None,
            )
            if args["use_llama_pro"]:
                args["freeze_trainable_layers"] = get("train.freeze_trainable_layers")

        # rlhf config
        stage = args["stage"]
        if stage == "ppo":
            reward_model = get("train.reward_model")
            if finetuning_type in PEFT_METHODS:
                args["reward_model"] = ",".join(
                    get_save_dir(model_name, finetuning_type, adapter) for adapter in reward_model
                )
            else:
                args["reward_model"] = get_save_dir(model_name, finetuning_type, reward_model)

            args.update(
                reward_model_type="lora" if finetuning_type == "lora" else "full",
                ppo_score_norm=get("train.ppo_score_norm"),
                ppo_whiten_rewards=get("train.ppo_whiten_rewards"),
                top_k=0,
                top_p=0.9,
            )
        elif stage in ("dpo", "kto"):
            args.update(
                pref_beta=get("train.pref_beta"),
                pref_ftx=get("train.pref_ftx"),
                pref_loss=get("train.pref_loss"),
            )

        # multimodal config
        if model_name in MULTIMODAL_SUPPORTED_MODELS:
            args.update(
                freeze_vision_tower=get("train.freeze_vision_tower"),
                freeze_multi_modal_projector=get("train.freeze_multi_modal_projector"),
                freeze_language_model=get("train.freeze_language_model"),
                image_max_pixels=calculate_pixels(get("train.image_max_pixels")),
                image_min_pixels=calculate_pixels(get("train.image_min_pixels")),
                video_max_pixels=calculate_pixels(get("train.video_max_pixels")),
                video_min_pixels=calculate_pixels(get("train.video_min_pixels")),
            )

        # galore config
        if args["use_galore"]:
            args.update(
                galore_rank=get("train.galore_rank"),
                galore_update_interval=get("train.galore_update_interval"),
                galore_scale=get("train.galore_scale"),
                galore_target=get("train.galore_target"),
            )

        # apollo config
        if args["use_apollo"]:
            args.update(
                apollo_rank=get("train.apollo_rank"),
                apollo_update_interval=get("train.apollo_update_interval"),
                apollo_scale=get("train.apollo_scale"),
                apollo_target=get("train.apollo_target"),
            )

        # badam config
        if args["use_badam"]:
            args.update(
                badam_mode=get("train.badam_mode"),
                badam_switch_mode=get("train.badam_switch_mode"),
                badam_switch_interval=get("train.badam_switch_interval"),
                badam_update_ratio=get("train.badam_update_ratio"),
            )

        # swanlab config (keyed on the component value, not on the merged args)
        if get("train.use_swanlab"):
            args.update(
                swanlab_project=get("train.swanlab_project"),
                swanlab_run_name=get("train.swanlab_run_name"),
                swanlab_workspace=get("train.swanlab_workspace"),
                swanlab_api_key=get("train.swanlab_api_key"),
                swanlab_mode=get("train.swanlab_mode"),
            )

        # eval config: evaluation reuses the save interval and the train batch size
        val_size = get("train.val_size")
        if val_size > 1e-6 and args["stage"] != "ppo":
            args.update(
                val_size=val_size,
                eval_strategy="steps",
                eval_steps=args["save_steps"],
                per_device_eval_batch_size=args["per_device_train_batch_size"],
            )

        # ds config
        ds_stage = get("train.ds_stage")
        if ds_stage != "none":
            ds_offload = "offload_" if get("train.ds_offload") else ""
            args["deepspeed"] = os.path.join(DEFAULT_CACHE_DIR, f"ds_z{ds_stage}_{ds_offload}config.json")

        return args

chenych's avatar
chenych committed
292
293
    def _parse_eval_args(self, data: dict["Component", Any]) -> dict[str, Any]:
        r"""Convert the evaluation tab's component values into trainer arguments.

        Args:
            data: Mapping from UI components to their current values.

        Returns:
            The argument dict, with either ``do_predict`` or ``do_eval`` set.
        """

        def get(elem_id: str) -> Any:
            return data[self.manager.get_elem_by_id(elem_id)]

        model_name, finetuning_type = get("top.model_name"), get("top.finetuning_type")
        user_config = load_config()

        args = {
            "stage": "sft",
            "model_name_or_path": get("top.model_path"),
            "cache_dir": user_config.get("cache_dir", None),
            "preprocessing_num_workers": 16,
            "finetuning_type": finetuning_type,
            "quantization_method": get("top.quantization_method"),
            "template": get("top.template"),
            "rope_scaling": get("top.rope_scaling") if get("top.rope_scaling") != "none" else None,
            "flash_attn": "fa2" if get("top.booster") == "flashattn2" else "auto",
            "use_unsloth": (get("top.booster") == "unsloth"),
            "dataset_dir": get("eval.dataset_dir"),
            "eval_dataset": ",".join(get("eval.dataset")),
            "cutoff_len": get("eval.cutoff_len"),
            "max_samples": int(get("eval.max_samples")),
            "per_device_eval_batch_size": get("eval.batch_size"),
            "predict_with_generate": True,
            "report_to": "none",
            "max_new_tokens": get("eval.max_new_tokens"),
            "top_p": get("eval.top_p"),
            "temperature": get("eval.temperature"),
            "output_dir": get_save_dir(model_name, finetuning_type, get("eval.output_dir")),
            "trust_remote_code": True,
            "ddp_timeout": 180000000,
        }

        # exactly one of the two run modes is enabled
        args["do_predict" if get("eval.predict") else "do_eval"] = True

        # checkpoints: a list of adapters for PEFT methods, a single path otherwise
        checkpoint_path = get("top.checkpoint_path")
        if checkpoint_path:
            if finetuning_type in PEFT_METHODS:
                args["adapter_name_or_path"] = ",".join(
                    get_save_dir(model_name, finetuning_type, adapter) for adapter in checkpoint_path
                )
            else:
                args["model_name_or_path"] = get_save_dir(model_name, finetuning_type, checkpoint_path)

        # quantization
        quantization_bit = get("top.quantization_bit")
        if quantization_bit != "none":
            args.update(
                quantization_bit=int(quantization_bit),
                quantization_method=get("top.quantization_method"),
                double_quantization=not is_torch_npu_available(),
            )

        return args

chenych's avatar
chenych committed
346
347
    def _preview(self, data: dict["Component", Any], do_train: bool) -> Generator[dict["Component", str], None, None]:
        r"""Yield the command that a launch would run, or the validation error instead.

        Args:
            data: Mapping from UI components to their current values.
            do_train: Preview a training command when true, an evaluation command otherwise.
        """
        tab = "train" if do_train else "eval"
        output_box = self.manager.get_elem_by_id("{}.output_box".format(tab))

        error = self._initialize(data, do_train, from_preview=True)
        if error:
            gr.Warning(error)
            yield {output_box: error}
            return

        parse_args = self._parse_train_args if do_train else self._parse_eval_args
        yield {output_box: gen_cmd(parse_args(data))}

chenych's avatar
chenych committed
357
358
    def _launch(self, data: dict["Component", Any], do_train: bool) -> Generator[dict["Component", Any], None, None]:
        r"""Start the training process.

        Validates the configuration, stores the llamaboard config in the output
        directory, spawns ``llamafactory-cli train`` and then relays the updates
        produced by ``monitor()``.

        Args:
            data: Mapping from UI components to their current values.
            do_train: Launch a training run when true, an evaluation run otherwise.
        """
        output_box = self.manager.get_elem_by_id("{}.output_box".format("train" if do_train else "eval"))
        error = self._initialize(data, do_train, from_preview=False)
        if error:
            gr.Warning(error)
            yield {output_box: error}
        else:
            # remembered so that monitor() can read the same values later
            self.do_train, self.running_data = do_train, data
            args = self._parse_train_args(data) if do_train else self._parse_eval_args(data)

            os.makedirs(args["output_dir"], exist_ok=True)
            save_args(os.path.join(args["output_dir"], LLAMABOARD_CONFIG), self._build_config_dict(data))

            # os.environ.copy() returns a plain dict. A deep copy of os.environ is still
            # an os._Environ, whose item assignment calls putenv() and would leak the
            # variables below into this process's own environment.
            env = os.environ.copy()
            env["LLAMABOARD_ENABLED"] = "1"
            env["LLAMABOARD_WORKDIR"] = args["output_dir"]
            if args.get("deepspeed", None) is not None:
                env["FORCE_TORCHRUN"] = "1"

            # NOTE: DO NOT USE shell=True to avoid security risk
            self.trainer = Popen(["llamafactory-cli", "train", save_cmd(args)], env=env, stderr=PIPE, text=True)
            yield from self.monitor()

chenych's avatar
chenych committed
381
382
    def _build_config_dict(self, data: dict["Component", Any]) -> dict[str, Any]:
        r"""Build a dictionary containing the current training configuration."""
chenych's avatar
chenych committed
383
384
385
386
387
388
389
390
391
        config_dict = {}
        skip_ids = ["top.lang", "top.model_path", "train.output_dir", "train.config_path"]
        for elem, value in data.items():
            elem_id = self.manager.get_id_by_elem(elem)
            if elem_id not in skip_ids:
                config_dict[elem_id] = value

        return config_dict

Rayyyyy's avatar
V0.6.3  
Rayyyyy committed
392
393
394
395
396
397
398
399
400
401
402
403
404
    def preview_train(self, data: dict["Component", Any]) -> Generator[dict["Component", str], None, None]:
        r"""Preview the training command; delegates to ``_preview`` with ``do_train=True``."""
        yield from self._preview(data, do_train=True)

    def preview_eval(self, data: dict["Component", Any]) -> Generator[dict["Component", str], None, None]:
        r"""Preview the evaluation command; delegates to ``_preview`` with ``do_train=False``."""
        yield from self._preview(data, do_train=False)

    def run_train(self, data: dict["Component", Any]) -> Generator[dict["Component", Any], None, None]:
        r"""Launch a training run; delegates to ``_launch`` with ``do_train=True``."""
        yield from self._launch(data, do_train=True)

    def run_eval(self, data: dict["Component", Any]) -> Generator[dict["Component", Any], None, None]:
        r"""Launch an evaluation run; delegates to ``_launch`` with ``do_train=False``."""
        yield from self._launch(data, do_train=False)

    def monitor(self) -> Generator[dict["Component", Any], None, None]:
        r"""Monitor the training progress and logs.

        Waits on the trainer process in two-second slices, yielding component
        updates after each slice until the process exits. It then yields a final
        summary and resets the runner through ``_finalize``.

        Yields:
            Dicts mapping UI components to their new values.
        """
        self.aborted = False
        self.running = True

        def get(elem_id: str) -> Any:
            return self.running_data[self.manager.get_elem_by_id(elem_id)]

        lang, model_name, finetuning_type = get("top.lang"), get("top.model_name"), get("top.finetuning_type")
        output_dir = get("{}.output_dir".format("train" if self.do_train else "eval"))
        output_path = get_save_dir(model_name, finetuning_type, output_dir)

        output_box = self.manager.get_elem_by_id("{}.output_box".format("train" if self.do_train else "eval"))
        progress_bar = self.manager.get_elem_by_id("{}.progress_bar".format("train" if self.do_train else "eval"))
        # these two components are only looked up for training runs
        loss_viewer = self.manager.get_elem_by_id("train.loss_viewer") if self.do_train else None
        swanlab_link = self.manager.get_elem_by_id("train.swanlab_link") if self.do_train else None

        running_log = ""
        # None means "still running". A numeric sentinel such as -1 is itself a valid
        # exit status (a child killed by signal 1 reports -1) and would keep this
        # loop spinning forever after the process has already gone.
        return_code: Optional[int] = None
        while return_code is None:
            if self.aborted:
                yield {
                    output_box: ALERTS["info_aborting"][lang],
                    progress_bar: gr.Slider(visible=False),
                }
            else:
                running_log, running_progress, running_info = get_trainer_info(lang, output_path, self.do_train)
                return_dict = {
                    output_box: running_log,
                    progress_bar: running_progress,
                }
                if "loss_viewer" in running_info:
                    return_dict[loss_viewer] = running_info["loss_viewer"]

                if "swanlab_link" in running_info:
                    return_dict[swanlab_link] = running_info["swanlab_link"]

                yield return_dict

            try:
                # returns only once the process has exited; otherwise times out after 2s
                stderr = self.trainer.communicate(timeout=2)[1]
                return_code = self.trainer.returncode
            except TimeoutExpired:
                continue

        if return_code == 0 or self.aborted:
            finish_info = ALERTS["info_finished"][lang]
            if self.do_train:
                finish_log = ALERTS["info_finished"][lang] + "\n\n" + running_log
            else:
                finish_log = load_eval_results(os.path.join(output_path, "all_results.json")) + "\n\n" + running_log
        else:
            print(stderr)
            finish_info = ALERTS["err_failed"][lang]
            finish_log = ALERTS["err_failed"][lang] + f" Exit code: {return_code}\n\n```\n{stderr}\n```\n"

        self._finalize(lang, finish_info)
        return_dict = {output_box: finish_log, progress_bar: gr.Slider(visible=False)}
        yield return_dict

    def save_args(self, data):
        r"""Validate the training configuration and write it under the config directory.

        Returns a dict mapping the training output box to either the validation
        error or the confirmation message followed by the saved path.
        """
        get_elem = self.manager.get_elem_by_id
        output_box = get_elem("train.output_box")

        error = self._initialize(data, do_train=True, from_preview=True)
        if error:
            gr.Warning(error)
            message = error
        else:
            lang, config_path = data[get_elem("top.lang")], data[get_elem("train.config_path")]
            os.makedirs(DEFAULT_CONFIG_DIR, exist_ok=True)
            save_path = os.path.join(DEFAULT_CONFIG_DIR, config_path)
            save_args(save_path, self._build_config_dict(data))
            message = ALERTS["info_config_saved"][lang] + save_path

        return {output_box: message}

    def load_args(self, lang: str, config_path: str):
        r"""Read a saved configuration and return the component updates that apply it.

        Args:
            lang: Language key for the status message.
            config_path: File name joined onto the config directory.

        Returns:
            A dict mapping components to values; only the output box with an error
            message when the module-level ``load_args`` returns ``None``.
        """
        output_box = self.manager.get_elem_by_id("train.output_box")
        config_dict = load_args(os.path.join(DEFAULT_CONFIG_DIR, config_path))
        if config_dict is None:
            not_found = ALERTS["err_config_not_found"][lang]
            gr.Warning(not_found)
            return {output_box: not_found}

        output_dict: dict[Component, Any] = {output_box: ALERTS["info_config_loaded"][lang]}
        output_dict.update(
            (self.manager.get_elem_by_id(elem_id), value) for elem_id, value in config_dict.items()
        )
        return output_dict
chenych's avatar
chenych committed
491
492

    def check_output_dir(self, lang: str, model_name: str, finetuning_type: str, output_dir: str):
        r"""Restore the training status if output_dir exists.

        Args:
            lang: Language key for the messages.
            model_name: Model name used to resolve the save directory.
            finetuning_type: Finetuning type used to resolve the save directory.
            output_dir: Output directory name entered by the user.

        Returns:
            A dict mapping components to values: the default output box text, or a
            warning plus the values of the llamaboard config found in the directory.
        """
        output_box = self.manager.get_elem_by_id("train.output_box")
        output_dict: dict[Component, Any] = {output_box: LOCALES["output_box"][lang]["value"]}
        if not (model_name and output_dir):
            return output_dict

        save_dir = get_save_dir(model_name, finetuning_type, output_dir)
        if not os.path.isdir(save_dir):
            return output_dict

        gr.Warning(ALERTS["warn_output_dir_exists"][lang])
        output_dict[output_box] = ALERTS["warn_output_dir_exists"][lang]

        config_dict = load_args(os.path.join(save_dir, LLAMABOARD_CONFIG))  # load llamaboard config
        # load_args() can return None (the load_args method of this class checks for
        # it); an existing directory without a llamaboard config must not raise here.
        if config_dict is not None:
            for elem_id, value in config_dict.items():
                output_dict[self.manager.get_elem_by_id(elem_id)] = value

        return output_dict