"docs/ZH_CN/git@developer.sourcefind.cn:Wenxuan/LightX2V.git" did not exist on "3e4fe79b5b9d7f2d49eac0f94f6e1f25dbf6f5be"
convert.py 5.83 KB
Newer Older
Sugon_ldc's avatar
Sugon_ldc committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#!/usr/bin/env python3
# Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
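
"""Generate a Triton-style config.pbtxt for every model under --model_repo.

Reads the training config (--config) and the exported ONNX config
(<onnx_model_dir>/config.yaml), fills the "#placeholder" tokens in each
model's config_template.pbtxt, and copies the encoder/decoder ONNX files
into each model's "1/" version directory.
"""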

import argparse
import os
import shutil

import onnx
import yaml

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='generate config.pbtxt for model_repo')
    parser.add_argument('--config', required=True, help='config file')
    parser.add_argument('--vocab', required=True,
                        help='vocabulary file, units.txt')
    parser.add_argument('--model_repo', required=True,
                        help='model repo directory')
    parser.add_argument('--onnx_model_dir', default=None, type=str, required=True,
                        help="onnx model directory")
    parser.add_argument('--lm_path', default=None, type=str, required=False,
                        help="the additional language model path")
    args = parser.parse_args()
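    # Example invocation (all paths below are illustrative; --lm_path is
    # optional):
    #   python3 convert.py --config train.yaml --vocab units.txt \
    #       --model_repo model_repo/ --onnx_model_dir exported_onnx/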
    with open(args.config, 'r') as fin:
        configs = yaml.load(fin, Loader=yaml.FullLoader)

    with open(os.path.join(args.onnx_model_dir, 'config.yaml'), 'r') as fin:
        onnx_configs = yaml.load(fin, Loader=yaml.FullLoader)

    params = [("#beam_size", 10), ("#num_mel_bins", 80), ("#frame_shift", 10),
              ("#frame_length", 25), ("#sample_rate", 16000), ("#output_size", 256),
              ("#lm_path", ""), ("#bidecoder", 0), ("#vocabulary_path", ""),
              ("#DTYPE", "FP32")]
    model_params = dict(params)
    # fill values
    model_params["#beam_size"] = onnx_configs["beam_size"]
    if onnx_configs["fp16"]:
        model_params["#DTYPE"] = "FP16"
    feature_conf = configs["dataset_conf"]["fbank_conf"]
    model_params["#num_mel_bins"] = feature_conf["num_mel_bins"]
    model_params["#frame_shift"] = feature_conf["frame_shift"]
    model_params["#frame_length"] = feature_conf["frame_length"]
    dataset_conf = configs["dataset_conf"]["resample_conf"]
    model_params["#sample_rate"] = dataset_conf["resample_rate"]
    model_params["#output_size"] = configs["encoder_conf"]["output_size"]
    model_params["#encoder_output_size"] = model_params["#output_size"]
    model_params["#lm_path"] = args.lm_path
    if configs["decoder"].startswith("bi"):
        model_params["#bidecoder"] = 1
    model_params["#vocabulary_path"] = args.vocab
    model_params["#vocab_size"] = configs["output_dim"]

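    # Only streaming exports record "decoding_window" in config.yaml, so its
    # presence is used as the streaming signal.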
    streaming = "decoding_window" in onnx_configs
    if streaming:
        # add streaming model parameters
        chunk_size = onnx_configs["decoding_chunk_size"]
        num_left_chunks = onnx_configs["num_decoding_left_chunks"]
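        # the attention cache spans num_left_chunks chunks of left context,
        # each chunk_size (subsampled) frames long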
        cache_size = chunk_size * num_left_chunks
        model_params["#cache_size"] = cache_size
        subsampling_rate = onnx_configs["subsampling_rate"]
        frame_shift = model_params["#frame_shift"]
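        # chunk_size counts subsampled frames: scale back by the subsampling
        # rate, multiply by the frame shift (ms), and convert ms to seconds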
        chunk_seconds = (chunk_size * subsampling_rate * frame_shift) / 1000
        model_params["#chunk_size_in_seconds"] = chunk_seconds
        model_params["#num_layers"] = configs["encoder_conf"]["num_blocks"]
        model_params["#context"] = onnx_configs["context"]
        model_params["#cnn_module_cache"] = onnx_configs["cnn_module_kernel_cache"]
        model_params["#decoding_window"] = onnx_configs["decoding_window"]
        head = configs["encoder_conf"]["attention_heads"]
        model_params["#num_head"] = head
        d_k = configs["encoder_conf"]["output_size"] // head
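        # each cached attention entry holds the key and value halves
        # concatenated along the last axis, hence d_k * 2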
        model_params["#att_cache_output_size"] = d_k * 2

    for model in os.listdir(args.model_repo):
        template = "config_template.pbtxt"
        # a non-bidirectional (non-U2++) decoder uses the alternate template
        if model == "decoder" and model_params["#bidecoder"] == 0:
            template = "config_template2.pbtxt"
        # so does a streaming pure-transformer encoder (no conformer
        # convolution cache)
        if model == "encoder" and model_params.get("#cnn_module_cache", -1) == 0:
            template = "config_template2.pbtxt"

        model_dir = os.path.join(args.model_repo, model)
        out_path = os.path.join(model_dir, "config.pbtxt")

        if model in ("decoder", "encoder"):
            if onnx_configs["fp16"]:
                model_name = model + "_fp16.onnx"
            else:
                model_name = model + ".onnx"
            source_model = os.path.join(args.onnx_model_dir, model_name)
            target_model = os.path.join(model_dir, "1", model + ".onnx")
            # shutil.copy raises on failure, unlike an unchecked `cp` call
            shutil.copy(source_model, target_model)
            if model == "encoder":
                # currently, with torch 1.10, the
                # exported conformer encoder output size is -1
                # Solution: Please upgrade your torch version
                # torch version >= 1.11.0 should fix this issue
                model = onnx.load(source_model)
                if streaming:
                    encoder_out = model.graph.output[2]
                else:
                    encoder_out = model.graph.output[0]
                output_dim = encoder_out.type.tensor_type.shape.dim[2].dim_param
                if output_dim.startswith("Add"):
                    model_params["#encoder_output_size"] = -1

        # render the template: drop full-line comments and substitute every
        # placeholder with its value
        with open(os.path.join(model_dir, template), "r", encoding="utf-8") as fin, \
                open(out_path, "w") as fout:
            for line in fin:
                if line.startswith("#"):
                    continue
                for key, value in model_params.items():
                    line = line.replace(key, str(value))
                fout.write(line)
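
# For reference, a template line such as the following (illustrative only,
# not copied from any real template):
#     parameters { key: "beam_size" value: { string_value: "#beam_size" } }
# would be rendered into config.pbtxt as:
#     parameters { key: "beam_size" value: { string_value: "10" } }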