lite.py 3.67 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# Copyright (c) OpenMMLab. All rights reserved.


class SubCliLite:
    """Sub-commands for compressing LLMs.

    Every method forwards its arguments to the matching function under
    ``lmdeploy.lite.apis``. The target function is imported inside the
    method body rather than at file level.
    """

    def auto_awq(self,
                 model: str,
                 work_dir: str,
                 w_bits: int = 4,
                 w_sym: bool = False,
                 w_group_size: int = 128,
                 device: str = 'cuda'):
        """Quantize model weights with the AWQ algorithm.

        Args:
            model (str): Path of the model in hf format.
            work_dir (str): Working directory where results are saved.
            w_bits (int): Number of bits used for weight quantization.
            w_sym (bool): Whether to quantize symmetrically.
            w_group_size (int): Group size for weight quantization
                statistics.
            device (str): Device type to run on.
        """
        from lmdeploy.lite.apis.auto_awq import auto_awq as run_auto_awq

        # ``model`` and ``work_dir`` stay positional; everything else is
        # handed over by keyword.
        quant_options = {
            'w_bits': w_bits,
            'w_sym': w_sym,
            'w_group_size': w_group_size,
            'device': device,
        }
        run_auto_awq(model, work_dir, **quant_options)

    def calibrate(self,
                  model: str,
                  calib_dataset: str = 'c4',
                  calib_samples: int = 128,
                  calib_seqlen: int = 2048,
                  work_dir: str = './work_dir',
                  device: str = 'cuda') -> None:
        """Run calibration on a given dataset.

        Args:
            model (str): The model to be loaded.
            calib_dataset (str, optional): Name of the calibration dataset.
                Defaults to 'c4'.
            calib_samples (int, optional): Number of samples used for
                calibration. Defaults to 128.
            calib_seqlen (int, optional): Sequence length used for
                calibration. Defaults to 2048.
            work_dir (str): Working directory for outputs.
                Defaults to './work_dir'.
            device (str, optional): Device used for calculation.
                Defaults to 'cuda'.
        """
        from lmdeploy.lite.apis.calibrate import calibrate as run_calibrate

        # Only ``model`` is positional; the rest is passed by keyword.
        calib_options = {
            'calib_dataset': calib_dataset,
            'calib_samples': calib_samples,
            'calib_seqlen': calib_seqlen,
            'work_dir': work_dir,
            'device': device,
        }
        run_calibrate(model, **calib_options)

    def kv_qparams(self,
                   work_dir: str,
                   turbomind_dir: str,
                   kv_bits: int = 8,
                   kv_sym: bool = False,
                   num_tp: int = 1) -> None:
        """Export key and value stats.

        Args:
            work_dir (str): Directory path where the stats are saved.
            turbomind_dir (str): Directory path where the results are
                written.
            kv_bits (int, optional): Number of bits for quantization.
                Defaults to 8.
            kv_sym (bool, optional): Whether to use symmetric quantization.
                Defaults to False.
            num_tp (int, optional): Number of tensor parallelism.
                Defaults to 1.
        """
        from lmdeploy.lite.apis.kv_qparams import main as export_kv_qparams

        # The two directories stay positional; the rest is passed by
        # keyword.
        kv_options = {
            'kv_bits': kv_bits,
            'kv_sym': kv_sym,
            'num_tp': num_tp,
        }
        export_kv_qparams(work_dir, turbomind_dir, **kv_options)

    def get_small_sharded_hf(self, src_dir: str, dst_dir: str):
        """Convert a hugging face model to the smallest sharded one.

        Args:
            src_dir (str): Directory of the input HF model.
            dst_dir (str): Directory where the new model is saved.
        """
        from lmdeploy.lite.apis.get_small_sharded_hf import \
            main as convert_to_small_shards

        convert_to_small_shards(src_dir, dst_dir)