misc.py 9.16 KB
Newer Older
chenych's avatar
chenych committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# Copyright 2024 HuggingFace Inc. and the LlamaFactory team.
#
# This code is inspired by the HuggingFace's PEFT library.
# https://github.com/huggingface/peft/blob/v0.10.0/src/peft/peft_model.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

Rayyyyy's avatar
V0.6.3  
Rayyyyy committed
18
19
import gc
import os
luopl's avatar
luopl committed
20
from typing import TYPE_CHECKING, Any, Dict, Literal, Sequence, Tuple, Union
Rayyyyy's avatar
V0.6.3  
Rayyyyy committed
21
22

import torch
luopl's avatar
luopl committed
23
import torch.distributed as dist
chenych's avatar
chenych committed
24
25
26
import transformers.dynamic_module_utils
from transformers import InfNanRemoveLogitsProcessor, LogitsProcessorList
from transformers.dynamic_module_utils import get_relative_imports
Rayyyyy's avatar
V0.6.3  
Rayyyyy committed
27
28
29
30
31
32
33
34
35
from transformers.utils import (
    is_torch_bf16_gpu_available,
    is_torch_cuda_available,
    is_torch_mps_available,
    is_torch_npu_available,
    is_torch_xpu_available,
)
from transformers.utils.versions import require_version

luopl's avatar
luopl committed
36
from . import logging
Rayyyyy's avatar
V0.6.3  
Rayyyyy committed
37
38
39
40


# fp16 is treated as usable whenever an NPU or a CUDA device is detected.
_is_fp16_available = is_torch_npu_available() or is_torch_cuda_available()

# bf16 support is probed once at import time; any failure while probing
# is interpreted as "bf16 is not available" instead of aborting the import.
try:
    if is_torch_bf16_gpu_available():
        _is_bf16_available = True
    else:
        _is_bf16_available = is_torch_npu_available() and torch.npu.is_bf16_supported()
except Exception:
    _is_bf16_available = False


if TYPE_CHECKING:
chenych's avatar
chenych committed
47
    from numpy.typing import NDArray
Rayyyyy's avatar
V0.6.3  
Rayyyyy committed
48

chenych's avatar
chenych committed
49
    from ..hparams import ModelArguments
Rayyyyy's avatar
V0.6.3  
Rayyyyy committed
50
51


luopl's avatar
luopl committed
52
# Module-level logger, created through the package-local `logging` module
# (imported via `from . import logging`), not the standard library one.
logger = logging.get_logger(__name__)
Rayyyyy's avatar
V0.6.3  
Rayyyyy committed
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75


class AverageMeter:
    r"""
    Computes and stores the average and current value.

    Attributes:
        val: the most recent value passed to `update`.
        sum: the weighted sum of all values seen since the last `reset`.
        count: the accumulated weight (number of samples) since the last `reset`.
        avg: `sum / count`, or 0 while `count` is still 0.
    """

    def __init__(self) -> None:
        self.reset()

    def reset(self) -> None:
        r"""
        Clears all statistics back to zero.
        """
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1) -> None:
        r"""
        Records a new value.

        Args:
            val: the value to record.
            n: the weight of the value, e.g. the number of samples it summarizes.
        """
        self.val = val
        self.sum += val * n
        self.count += n
        # Guard the division: `update(val, n=0)` on a fresh meter used to raise ZeroDivisionError.
        self.avg = self.sum / self.count if self.count != 0 else 0


luopl's avatar
luopl committed
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
def check_version(requirement: str, mandatory: bool = False) -> None:
    r"""
    Verifies that an installed package satisfies the given requirement string.

    The check is skipped (with a one-time warning) when the `DISABLE_VERSION_CHECK`
    environment variable is `true` or `1`, unless `mandatory` is set.
    """
    check_disabled = os.getenv("DISABLE_VERSION_CHECK", "0").lower() in ["true", "1"]
    if check_disabled and not mandatory:
        logger.warning_rank0_once("Version checking has been disabled, may lead to unexpected behaviors.")
        return

    # Mandatory checks cannot be skipped, so the hint does not advertise the opt-out.
    hint = (
        f"To fix: run `pip install {requirement}`."
        if mandatory
        else f"To fix: run `pip install {requirement}` or set `DISABLE_VERSION_CHECK=1` to skip this check."
    )
    require_version(requirement, hint)


Rayyyyy's avatar
V0.6.3  
Rayyyyy committed
92
def check_dependencies() -> None:
    r"""
    Runs the (skippable) version check for each core dependency of the package.
    """
    supported_versions = (
        "transformers>=4.41.2,<=4.46.1",
        "datasets>=2.16.0,<=3.1.0",
        "accelerate>=0.34.0,<=1.0.1",
        "peft>=0.11.1,<=0.12.0",
        "trl>=0.8.6,<=0.9.6",
    )
    for requirement in supported_versions:
        check_version(requirement)


def calculate_tps(dataset: Sequence[Dict[str, Any]], metrics: Dict[str, float], stage: Literal["sft", "rm"]) -> float:
    r"""
    Computes the effective tokens processed per second.

    For `sft` the tokens are the `input_ids` of every example; for `rm` they are the
    `chosen_input_ids` plus the `rejected_input_ids`. The total is scaled by
    `metrics["epoch"]`, divided by `metrics["train_runtime"]`, and additionally divided
    by the world size when `torch.distributed` is initialized.
    """
    if stage == "sft":
        token_total = sum(len(example["input_ids"]) for example in dataset)
    elif stage == "rm":
        token_total = sum(
            len(example["chosen_input_ids"]) + len(example["rejected_input_ids"]) for example in dataset
        )
    else:
        # Any other stage contributes no tokens.
        token_total = 0

    tokens_per_second = token_total * metrics["epoch"] / metrics["train_runtime"]
    if dist.is_initialized():
        return tokens_per_second / dist.get_world_size()

    return tokens_per_second
Rayyyyy's avatar
V0.6.3  
Rayyyyy committed
116
117


chenych's avatar
chenych committed
118
def count_parameters(model: "torch.nn.Module") -> Tuple[int, int]:
    r"""
    Counts the parameters of a model.

    Returns a tuple `(trainable, total)`, where `trainable` only covers parameters
    whose `requires_grad` flag is set.
    """
    trainable_total, grand_total = 0, 0
    for weight in model.parameters():
        size = weight.numel()
        # With DeepSpeed ZeRO-3 the weights may be initialized empty; use the recorded size instead.
        if size == 0 and hasattr(weight, "ds_numel"):
            size = weight.ds_numel

        # Due to the design of 4bit linear layers from bitsandbytes,
        # the number of parameters is multiplied by twice the storage item size.
        if weight.__class__.__name__ == "Params4bit":
            storage = getattr(weight, "quant_storage", None)
            if hasattr(storage, "itemsize"):
                bytes_per_item = storage.itemsize
            elif hasattr(weight, "element_size"):  # for older pytorch version
                bytes_per_item = weight.element_size()
            else:
                bytes_per_item = 1

            size = size * 2 * bytes_per_item

        grand_total += size
        if weight.requires_grad:
            trainable_total += size

    return trainable_total, grand_total


chenych's avatar
chenych committed
147
def get_current_device() -> "torch.device":
    r"""
    Returns the device of the current process.

    Backends are probed in the order XPU, NPU, MPS, CUDA; the device index is taken
    from the `LOCAL_RANK` environment variable (default `0`). Falls back to the CPU.
    """
    if is_torch_xpu_available():
        backend = "xpu"
    elif is_torch_npu_available():
        backend = "npu"
    elif is_torch_mps_available():
        backend = "mps"
    elif is_torch_cuda_available():
        backend = "cuda"
    else:
        return torch.device("cpu")

    local_rank = os.environ.get("LOCAL_RANK", "0")
    return torch.device(f"{backend}:{local_rank}")


def get_device_count() -> int:
    r"""
    Returns the number of available XPU, NPU or CUDA devices (0 if none is detected).
    """
    if is_torch_xpu_available():
        backend = torch.xpu
    elif is_torch_npu_available():
        backend = torch.npu
    elif is_torch_cuda_available():
        backend = torch.cuda
    else:
        return 0

    return backend.device_count()


def get_logits_processor() -> "LogitsProcessorList":
    r"""
    Builds a logits processor list containing a single processor that removes NaN and Inf logits.
    """
    return LogitsProcessorList([InfNanRemoveLogitsProcessor()])


luopl's avatar
luopl committed
188
189
190
191
192
193
194
195
196
197
198
199
def get_peak_memory() -> Tuple[int, int]:
    r"""
    Returns the peak `(allocated, reserved)` memory of the current device (in Bytes).

    Only NPU and CUDA devices are queried; `(0, 0)` is returned otherwise.
    """
    if is_torch_npu_available():
        backend = torch.npu
    elif is_torch_cuda_available():
        backend = torch.cuda
    else:
        return 0, 0

    return backend.max_memory_allocated(), backend.max_memory_reserved()


chenych's avatar
chenych committed
200
201
202
203
204
205
206
207
def has_tokenized_data(path: "os.PathLike") -> bool:
    r"""
    Tells whether `path` is an existing, non-empty directory.
    """
    if not os.path.isdir(path):
        return False

    return len(os.listdir(path)) > 0


def infer_optim_dtype(model_dtype: "torch.dtype") -> "torch.dtype":
    r"""
    Picks the compute dtype from the model dtype and the device capabilities.

    bfloat16 is kept only when the model uses it and the device supports it;
    otherwise float16 is used when available, and float32 as the last resort.
    """
    if _is_bf16_available and model_dtype == torch.bfloat16:
        return torch.bfloat16

    if _is_fp16_available:
        return torch.float16

    return torch.float32


chenych's avatar
chenych committed
219
def is_gpu_or_npu_available() -> bool:
    r"""
    Tells whether an NPU or a CUDA GPU is available.
    """
    if is_torch_npu_available():
        return True

    return is_torch_cuda_available()


def numpify(inputs: Union["NDArray", "torch.Tensor"]) -> "NDArray":
    r"""
    Casts a torch tensor or a numpy array to a numpy array.

    Tensors are detached from the autograd graph and moved to the CPU first;
    bfloat16 tensors are upcast to float32. Any non-tensor input is returned unchanged.
    """
    if isinstance(inputs, torch.Tensor):
        # `Tensor.numpy()` raises on tensors that require grad, so detach before converting.
        inputs = inputs.detach().cpu()
        if inputs.dtype == torch.bfloat16:  # bfloat16 cannot be converted directly, upcast first
            inputs = inputs.to(torch.float32)

        inputs = inputs.numpy()

    return inputs


def skip_check_imports() -> None:
    r"""
    Avoids flash attention import error in custom model files.

    Unless the `FORCE_CHECK_IMPORTS` environment variable is `true` or `1`,
    `transformers.dynamic_module_utils.check_imports` is replaced by `get_relative_imports`.
    """
    force_check = os.getenv("FORCE_CHECK_IMPORTS", "0").lower() in ["true", "1"]
    if force_check:
        return

    transformers.dynamic_module_utils.check_imports = get_relative_imports
Rayyyyy's avatar
V0.6.3  
Rayyyyy committed
246
247
248
249


def torch_gc() -> None:
    r"""
    Runs the Python garbage collector, then empties the cache of the detected accelerator.

    Backends are probed in the order XPU, NPU, MPS, CUDA; nothing else happens on CPU-only hosts.
    """
    gc.collect()
    if is_torch_xpu_available():
        backend = torch.xpu
    elif is_torch_npu_available():
        backend = torch.npu
    elif is_torch_mps_available():
        backend = torch.mps
    elif is_torch_cuda_available():
        backend = torch.cuda
    else:
        return

    backend.empty_cache()


luopl's avatar
luopl committed
263
264
def try_download_model_from_other_hub(model_args: "ModelArguments") -> str:
    r"""
    Resolves the model path, downloading from the ModelScope or openMind hub when one is enabled.

    `model_args.model_name_or_path` is returned untouched when no alternative hub is
    enabled or when it already exists on disk. ModelScope takes precedence over openMind.
    """
    model_id = model_args.model_name_or_path
    from_modelscope = use_modelscope()
    from_openmind = use_openmind()

    # The existence check only runs when at least one alternative hub is enabled.
    if not (from_modelscope or from_openmind) or os.path.exists(model_id):
        return model_id

    if from_modelscope:
        check_version("modelscope>=1.11.0", mandatory=True)
        from modelscope import snapshot_download  # type: ignore

        # The "main" revision is translated to "master" for ModelScope.
        if model_args.model_revision == "main":
            revision = "master"
        else:
            revision = model_args.model_revision

        return snapshot_download(model_id, revision=revision, cache_dir=model_args.cache_dir)

    if from_openmind:
        check_version("openmind>=0.8.0", mandatory=True)
        from openmind.utils.hub import snapshot_download  # type: ignore

        return snapshot_download(
            model_id,
            revision=model_args.model_revision,
            cache_dir=model_args.cache_dir,
        )
Rayyyyy's avatar
V0.6.3  
Rayyyyy committed
287
288
289


def use_modelscope() -> bool:
    r"""
    Tells whether the `USE_MODELSCOPE_HUB` environment variable is `true` or `1` (case-insensitive).
    """
    flag = os.environ.get("USE_MODELSCOPE_HUB", "0")
    return flag.lower() in ("true", "1")
luopl's avatar
luopl committed
291
292
293


def use_openmind() -> bool:
    r"""
    Tells whether the `USE_OPENMIND_HUB` environment variable is `true` or `1` (case-insensitive).
    """
    flag = os.environ.get("USE_OPENMIND_HUB", "0")
    return flag.lower() in ("true", "1")
luopl's avatar
luopl committed
295
296


luopl's avatar
luopl committed
297
298
def use_ray() -> bool:
    r"""
    Tells whether the `USE_RAY` environment variable is `true` or `1` (case-insensitive).
    """
    flag = os.environ.get("USE_RAY", "0")
    return flag.lower() in ("true", "1")