save_sharded_state.py 2.53 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
"""
Saves each worker's model state dict directly to a checkpoint, which enables a
fast load path for large tensor-parallel models where each worker only needs to
read its own shard rather than the entire checkpoint.

Example usage:

python save_sharded_state.py \
    --model /path/to/load \
    --tensor-parallel-size 8 \
    --output /path/to/save

Then, the model can be loaded with

llm = LLM(
    model="/path/to/save",
    load_format="sharded_state",
    tensor_parallel_size=8,
)
"""
23

24
25
26
27
28
29
import dataclasses
import os
import shutil
from pathlib import Path

from vllm import LLM, EngineArgs
30
from vllm.model_executor.model_loader import ShardedStateLoader
31
from vllm.utils.argparse_utils import FlexibleArgumentParser
32

33
34
35
36

def parse_args():
    """Parse the command line for the sharded-state export script.

    The parser starts with every option registered by
    ``EngineArgs.add_cli_args`` and then adds the options specific to this
    script:

    * ``--output`` / ``-o`` (required): path to the output checkpoint.
    * ``--file-pattern``: string pattern of the saved filenames; defaults to
      ``ShardedStateLoader.DEFAULT_PATTERN``.
    * ``--max-file-size``: maximum size in bytes of each safetensors file;
      defaults to 5 GiB.

    Returns:
        The parsed arguments object produced by the parser.
    """
    cli = FlexibleArgumentParser()
    EngineArgs.add_cli_args(cli)

    # (flags, keyword settings) for each script-specific option, listed in
    # the order they are registered on the parser.
    script_options = (
        (
            ("--output", "-o"),
            {"required": True, "type": str, "help": "path to output checkpoint"},
        ),
        (
            ("--file-pattern",),
            {
                "type": str,
                "default": ShardedStateLoader.DEFAULT_PATTERN,
                "help": "string pattern of saved filenames",
            },
        ),
        (
            ("--max-file-size",),
            {
                "type": int,
                "default": 5 * 1024**3,
                "help": "max size (in bytes) of each safetensors file",
            },
        ),
    )
    for flags, settings in script_options:
        cli.add_argument(*flags, **settings)

    return cli.parse_args()
53
54
55
56
57
58
59
60
61
62
63
64
65
66


def _copy_metadata_files(model_path, output_dir):
    """Copy every non-weight entry of ``model_path`` into ``output_dir``.

    Entries whose extension is ``.bin``, ``.pt`` or ``.safetensors`` are
    skipped. Sub-directories are copied recursively; everything else is
    copied as a single file.
    """
    weight_extensions = (".bin", ".pt", ".safetensors")
    for name in os.listdir(model_path):
        if os.path.splitext(name)[1] in weight_extensions:
            continue
        source = os.path.join(model_path, name)
        if os.path.isdir(source):
            # dirs_exist_ok lets the script be re-run into an existing output
            # directory; without it copytree raises FileExistsError as soon
            # as the sub-directory from a previous run is found.
            shutil.copytree(
                source, os.path.join(output_dir, name), dirs_exist_ok=True
            )
        else:
            shutil.copy(source, output_dir)


def main(args):
    """Save the model's sharded state and copy its metadata files.

    Builds ``EngineArgs`` from the parsed CLI arguments, creates an ``LLM``
    from them, asks the engine core to save the sharded state into
    ``args.output``, and finally copies every non-weight file or directory
    from the model directory into the output directory.

    Args:
        args: Parsed command-line arguments. Besides the engine arguments,
            ``output``, ``file_pattern`` and ``max_file_size`` are read.

    Raises:
        ValueError: If ``enable_lora`` is set, or if the model path is not a
            local directory.
    """
    engine_args = EngineArgs.from_cli_args(args)
    if engine_args.enable_lora:
        raise ValueError("Saving with enable_lora=True is not supported!")
    model_path = engine_args.model
    if not Path(model_path).is_dir():
        raise ValueError("model path must be a local directory")
    # Create LLM instance from arguments
    llm = LLM(**dataclasses.asdict(engine_args))
    # Prepare output directory; parents=True so a nested output path whose
    # parent directories do not exist yet is created instead of failing.
    Path(args.output).mkdir(parents=True, exist_ok=True)
    # Dump worker states to output directory
    llm.llm_engine.engine_core.save_sharded_state(
        path=args.output, pattern=args.file_pattern, max_size=args.max_file_size
    )
    # Copy metadata files to output directory
    _copy_metadata_files(model_path, args.output)


if __name__ == "__main__":
    # Script entry point: parse the command line, then run the export.
    main(parse_args())