arguments.py 2.77 KB
Newer Older
wangxj's avatar
wangxj committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.


def add_modelopt_args(parser):
    """Add additional arguments for using TensorRT Model Optimizer (modelopt) features.

    Registers a ``modelopt-generic`` argument group covering checkpoint
    compatibility, quantization, knowledge distillation, speculative
    decoding, and finetuning options.

    Args:
        parser: an ``argparse.ArgumentParser`` to extend in place.

    Returns:
        The same parser, for call chaining.
    """
    group = parser.add_argument_group(title="modelopt-generic")

    # Model and Checkpoint Compatibility
    group.add_argument(
        "--export-model-type",
        type=str,
        default="GPTModel",
        choices=["GPTModel", "MambaModel"],
        help="Model type to use in model_provider.",
    )
    group.add_argument(
        "--export-legacy-megatron",
        action="store_true",
        help="Export a legacy megatron-lm checkpoint.",
    )
    group.add_argument(
        "--export-te-mcore-model",
        action="store_true",
        help="Export a megatron-core transformer-engine checkpoint.",
    )
    group.add_argument(
        "--export-force-local-attention",
        action="store_true",
        help="Forcing local DotProductAttention; otherwise TEDotProductAttention is used.",
    )

    # Quantization
    group.add_argument(
        "--export-kv-cache-quant",
        action="store_true",
        help="Whether or not to perform KV-cache quantization.",
    )
    # NOTE: default is the literal string "None" (a valid choice), unlike
    # --export-quant-cfg below whose default is the Python ``None`` object.
    # Kept as-is for backward compatibility with existing callers.
    group.add_argument(
        "--export-real-quant-cfg",
        type=str,
        default="None",
        choices=["fp8_real_quant", "fp8_blockwise_real_quant", "None"],
        help="Specify a real quantization config from the supported choices.",
    )
    group.add_argument(
        "--export-quant-cfg",
        type=str,
        default=None,
        choices=[
            "int8",
            "int8_sq",
            "fp8",
            "fp8_real_quant",
            "fp8_blockwise",
            "fp8_blockwise_real_quant",
            "int4_awq",
            "w4a8_awq",
            "int4",
            "fp4",
            "None",
        ],
        help="Specify a quantization config from the supported choices.",
    )

    # Knowledge Distillation
    group.add_argument(
        "--export-kd-cfg",
        type=str,
        default=None,
        help="Path to distillation configuration yaml file.",
    )
    group.add_argument(
        "--export-kd-teacher-load",
        type=str,
        help="Path to checkpoint to load as distillation teacher.",
    )
    group.add_argument(
        "--export-kd-finalize",
        action="store_true",
        help="Export original student class back from a loaded distillation model.",
    )

    # Speculative decoding
    group.add_argument(
        "--export-num-medusa-heads",
        type=int,
        default=0,
        help="Number of Medusa heads for speculative decoding.",
    )
    group.add_argument(
        "--export-num-eagle-layers",
        type=int,
        default=0,
        help="Number of EAGLE layers for speculative decoding.",
    )

    # Finetuning
    group.add_argument(
        "--finetune-hf-dataset",
        type=str,
        default=None,
        help="HF dataset used for finetuning.",
    )

    return parser