infer_io.py
import torch
from torch.utils.data import DataLoader, Dataset
import soundfile
import time
import numpy as np
import os
import multiprocessing
import argparse
import sys
from typing import Dict, Optional, Tuple

from espnet2.bin.asr_inference import Speech2Text
from espnet2.torch_utils.device_funcs import to_device
torch.set_num_threads(1)
try:
    from swig_decoders import map_batch, \
        ctc_beam_search_decoder_batch, \
        TrieVector, PathTrie
except ImportError:
    print('Please install ctc decoders first by referring to\n' +
          'https://github.com/Slyne/ctc_decoder.git')
    sys.exit(1)


class CustomAishellDataset(Dataset):
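    """AISHELL-style dataset backed by Kaldi ``wav.scp`` and ``text`` files.

    Assumes both files list utterances in the same order. Each item yields the
    raw 16 kHz waveform, its length in samples, the reference transcript and
    the utterance id.
    """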
    def __init__(self, wav_scp_file, text_file):

        with open(wav_scp_file,'r') as wav_scp, open(text_file,'r') as text:
            wavs = wav_scp.readlines()
            texts = text.readlines()

        self.wav_names = [item.split()[0] for item in wavs]
        self.wav_paths = [item.split()[1] for item in wavs]
        self.labels = ["".join(item.split()[1:]) for item in texts]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        speech,sr = soundfile.read(self.wav_paths[idx])
        assert sr==16000, sr
        speech = np.array(speech, dtype=np.float32)
        speech_len = speech.shape[0]
        label = self.labels[idx]
        name = self.wav_names[idx]
        return speech, speech_len, label, name


def collate_wrapper(batch):
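    """Zero-pad each waveform to the longest utterance in the batch.

    A scratch buffer sized for 30 s of 16 kHz audio is filled and then trimmed
    to the batch maximum; labels and utterance names are passed through as lists.
    """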
    speeches = np.zeros((len(batch), 16000 * 30),dtype=np.float32)
    lengths = np.zeros(len(batch),dtype=np.int64)
    labels = []
    names = []
    for i, (speech, speech_len, label, name) in enumerate(batch):
        speeches[i,:speech_len] = speech
        lengths[i] = speech_len
        labels.append(label)
        names.append(name)
    speeches = speeches[:,:max(lengths)]
    return speeches, lengths, labels, names

def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
    """Make mask tensor containing indices of padded part.
    See description of make_non_pad_mask.
    Args:
        lengths (torch.Tensor): Batch of lengths (B,).
    Returns:
        torch.Tensor: Mask tensor containing indices of padded part.
    Examples:
        >>> lengths = [5, 3, 2]
        >>> make_pad_mask(lengths)
        masks = [[0, 0, 0, 0 ,0],
                 [0, 0, 0, 1, 1],
                 [0, 0, 1, 1, 1]]
    """
    batch_size = lengths.size(0)
    max_len = max_len if max_len > 0 else lengths.max().item()
    seq_range = torch.arange(0,
                             max_len,
                             dtype=torch.int64,
                             device=lengths.device)
    seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
    seq_length_expand = lengths.unsqueeze(-1)
    mask = seq_range_expand >= seq_length_expand
    return mask

def get_args():
    parser = argparse.ArgumentParser(description='recognize with your model')
    parser.add_argument('--config', required=True, help='config file')
    parser.add_argument('--lm_config', required=True, help='config file')
    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='gpu id for this rank, -1 for cpu')
    parser.add_argument('--wav_scp', required=True, help='wav scp file')
    parser.add_argument('--text', required=True, help='ground truth text file')
    parser.add_argument('--model_path', required=True, help='torch pt model file')
    parser.add_argument('--lm_path', required=True, help='torch pt model file')
    parser.add_argument('--result_file', default='./predictions.txt', help='asr result file')
    parser.add_argument('--log_file', default='./rtf.txt', help='asr decoding log')
    parser.add_argument('--batch_size',
                        type=int,
                        default=24,
                        help='batch_size')
    parser.add_argument('--beam_size',
                        type=int,
                        default=10,
                        help='beam_size')
    parser.add_argument('--mode',
                        choices=[
                            'ctc_greedy_search', 'ctc_prefix_beam_search',
                            'attention_rescoring', 'attention_lm_rescoring', 'lm_rescoring'],
                        default='attention_lm_rescoring',
                        help='decoding mode')

    args = parser.parse_args()
    return args

if __name__ == '__main__':
    args = get_args()
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)    
    dataset = CustomAishellDataset(args.wav_scp, args.text)
    test_data_loader = DataLoader(dataset, batch_size=args.batch_size,
                                  collate_fn=collate_wrapper)
    speech2text = Speech2Text(
        args.config,
        args.model_path,
        None,
        args.lm_config,
        args.lm_path,
        device="cuda"
    )
    
    # Manually load the complete ESPnetLanguageModel object: Speech2Text only
    # stores the raw LM, and we need the full wrapper to call batchify_nll.
    full_lm_model = None
    if args.lm_config is not None and args.lm_path is not None:
        from espnet2.tasks.lm import LMTask
        full_lm_model, _ = LMTask.build_model_from_file(
            args.lm_config, args.lm_path, "cuda"
        )
        full_lm_model.eval()
    import onnxruntime as ort

    sess_options = ort.SessionOptions()
    sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    sess_options.enable_cpu_mem_arena = False
    sess_options.enable_mem_pattern = False

    providers = ['ROCMExecutionProvider']
    encoder_path = "/home/sunzhq/workspace/yidong-infer/conformer/onnx_models_batch24_1/transformer_lm/full/default_encoder_fp16.onnx"
    encoder_session = ort.InferenceSession(encoder_path, 
                                           providers=providers)
    encoder_session_io = encoder_session.io_binding()
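    # IO binding keeps encoder inputs/outputs in device memory, avoiding an
    # extra host round-trip for the inputs on every batch.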
    
    output_names = ["encoder_out", "encoder_out_lens"]

    time_start = time.perf_counter()
    audio_sample_len = 0
    encoder_times = []
    ctc_times = []
    decoder_times = []
    lm_times = []
    beam_search_times = []
    count_times = []
    with torch.no_grad(), open(args.result_file, 'w') as fout:
        for _, batch in enumerate(test_data_loader):
            speech, speech_lens, labels, names = batch
            audio_sample_len += np.sum(speech_lens) / 16000
            batch = {"speech": speech, "speech_lengths": speech_lens}
            
            if isinstance(batch["speech"], np.ndarray):
                batch["speech"] = torch.tensor(batch["speech"])
            if isinstance(batch["speech_lengths"], np.ndarray):
                batch["speech_lengths"] = torch.tensor(batch["speech_lengths"])
            batch = to_device(batch, device='cuda')
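            # Front end: pre_data (a project-specific helper on the ESPnet ASR
            # model) computes the encoder input features on GPU; lengths are
            # divided by 4 to match the encoder's presumed 4x subsampling. The
            # features are then fed to the exported ONNX encoder via IO binding.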
            feats, encoder_out_lens = speech2text.asr_model.pre_data(**batch)
            encoder_out_lens = torch.ceil(encoder_out_lens.float() / 4).long()
            encoder_inputs = {'feats': feats.cpu().numpy().astype(np.float32)}
            inputData = {}
            for key in encoder_inputs.keys():
                inputData[key] = ort.OrtValue.ortvalue_from_numpy(encoder_inputs[key], device_type='cuda')
                encoder_session_io.bind_input(
                    name = key,
                    device_type = inputData[key].device_name(),
                    device_id = 0,
                    element_type = np.float32,
                    shape = inputData[key].shape(),
                    buffer_ptr = inputData[key].data_ptr())
            

            # Bind the encoder output on the GPU; ORT allocates the buffer itself.
            encoder_session_io.bind_output(name="encoder_out", device_type="cuda", device_id=0)
            ll_time = time.time()
            encoder_session.run_with_iobinding(encoder_session_io)
            # Copy the bound encoder output back to host memory and wrap it as
            # a CUDA tensor for the CTC and rescoring stages below.
            result = encoder_session_io.copy_outputs_to_cpu()
            encoder_out = torch.from_numpy(result[0]).float().cuda()
            
            ctc_log_probs = torch.nn.functional.log_softmax(
                speech2text.asr_model.ctc.ctc_lo(encoder_out), dim=2
            )
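
            # Keep the top beam_size tokens per frame; these per-frame
            # candidates feed the external CTC prefix beam search decoder.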
            beam_log_probs, beam_log_probs_idx = torch.topk(ctc_log_probs, 
                                                            args.beam_size, dim=2)
            
            # ctc_count = time.time() - ctc_time
            # print("ctc elapsed:", ctc_count)
            # ctc_times.append(ctc_count)
            num_processes = min(multiprocessing.cpu_count(), args.batch_size)
            
            if args.mode == 'ctc_greedy_search':
                assert args.beam_size != 1
                log_probs_idx = beam_log_probs_idx[:, :, 0]
                batch_sents = []
                for idx, seq in enumerate(log_probs_idx):
                    batch_sents.append(seq[0:encoder_out_lens[idx]].tolist())
                hyps = map_batch(batch_sents, speech2text.asr_model.token_list,
                                 num_processes, True, 0)
            else:
                # beam_search_time = time.time()
                batch_log_probs_seq_list = beam_log_probs.tolist()
                batch_log_probs_idx_list = beam_log_probs_idx.tolist()
                batch_len_list = encoder_out_lens.tolist()
                # batch_len_list = encoder_out_lens
                batch_log_probs_seq = []
                batch_log_probs_ids = []
                batch_start = []  # only effective in streaming deployment
                batch_root = TrieVector()
                root_dict = {}
                for i in range(len(batch_len_list)):
                    # print(batch_len_list)
                    # num_sent = batch_len_list[i]
                    num_sent = encoder_out.size()[1]
                    batch_log_probs_seq.append(
                        batch_log_probs_seq_list[i][0:num_sent])
                    batch_log_probs_ids.append(
                        batch_log_probs_idx_list[i][0:num_sent])
                    root_dict[i] = PathTrie()
                    batch_root.append(root_dict[i])
                    batch_start.append(True)
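                # Run the swig CTC prefix beam search over the whole batch; the
                # trailing positional args are blank id (0), space id (-2, no
                # space symbol) and cutoff probability, per the Slyne/ctc_decoder API.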
                score_hyps = ctc_beam_search_decoder_batch(batch_log_probs_seq,
                                                           batch_log_probs_ids,
                                                           batch_root,
                                                           batch_start,
                                                           args.beam_size,
                                                           num_processes,
                                                           0, -2, 0.99999)
                
                # beam_search_count = time.time() - beam_search_time
                # print("beam_search elapsed:", beam_search_count)
                # beam_search_times.append(beam_search_count)
                if args.mode == 'ctc_prefix_beam_search':
                    hyps = []
                    for cand_hyps in score_hyps:
                        hyps.append(cand_hyps[0][1])
                    hyps = map_batch(hyps, speech2text.asr_model.token_list, num_processes, False, 0)

                elif args.mode == 'attention_rescoring':
                    ctc_score, all_hyps = [], []
                    max_len = 0
                    for hyps in score_hyps:
                        cur_len = len(hyps)
                        if len(hyps) < args.beam_size:
                            hyps += (args.beam_size - cur_len) * [(-float("INF"), (0,))]
                        cur_ctc_score = []
                        for hyp in hyps:
                            cur_ctc_score.append(hyp[0])
                            all_hyps.append(list(hyp[1]))
                            if len(hyp[1]) > max_len:
                                max_len = len(hyp[1])
                        ctc_score.append(cur_ctc_score)
                
                    ctc_score = torch.tensor(ctc_score, dtype=torch.float32)
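                    # Pack all batch*beam hypotheses into padded tensors (plus
                    # sos/eos variants) so the attention decoder can rescore
                    # them in one batched forward pass.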
                    hyps_pad_sos_eos = torch.ones(
                        (args.batch_size, args.beam_size, max_len + 2), dtype=torch.int64) * speech2text.asr_model.ignore_id # FIXME: ignore id
                    hyps_pad_sos = torch.ones(
                        (args.batch_size, args.beam_size, max_len + 1), dtype=torch.int64) * speech2text.asr_model.eos # FIXME: eos
                    hyps_pad_eos = torch.ones(
                        (args.batch_size, args.beam_size, max_len + 1), dtype=torch.int64) * speech2text.asr_model.ignore_id # FIXME: ignore id
                    hyps_lens_sos = torch.ones((args.batch_size, args.beam_size), dtype=torch.int32)
                    k = 0
                    for i in range(args.batch_size):
                        for j in range(args.beam_size):
                            cand = all_hyps[k]
                            l = len(cand) + 2
                            hyps_pad_sos_eos[i][j][0:l] = torch.tensor([speech2text.asr_model.sos] + cand + [speech2text.asr_model.eos])
                            hyps_pad_sos[i][j][0:l-1] = torch.tensor([speech2text.asr_model.sos] + cand)
                            hyps_pad_eos[i][j][0:l-1] = torch.tensor(cand + [speech2text.asr_model.eos])
                            hyps_lens_sos[i][j] = len(cand) + 1
                            k += 1

                    bz = args.beam_size
                    B,T,F = encoder_out.shape
                    B2=B*bz
                    encoder_out = encoder_out.repeat(1, bz, 1).view(B2, T, F)
                    encoder_out_lens = encoder_out_lens.repeat(bz)

                    hyps_pad = hyps_pad_sos_eos.view(B2, max_len + 2)
                    hyps_lens = hyps_lens_sos.view(B2,)
                    hyps_pad_sos = hyps_pad_sos.view(B2, max_len + 1)
                    hyps_pad_eos = hyps_pad_eos.view(B2, max_len + 1)
                    #hyps_pad_sos = hyps_pad[:, :-1]
                    #hyps_pad_eos = hyps_pad[:, 1:]
           
             
                    decoder_out, _ = speech2text.asr_model.decoder(
                        encoder_out, encoder_out_lens, hyps_pad_sos.cuda(), hyps_lens.cuda())

                    decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1)

            
                
                    mask = ~make_pad_mask(hyps_lens, max_len+1)  # B2 x T2
                    # mask index, remove ignore id
                    index = torch.unsqueeze(hyps_pad_eos * mask, 2)
                    score = decoder_out.cpu().gather(2, index).squeeze(2)  # B2 X T2
                    # mask padded part
                    score = score * mask
                    # decoder_out = decoder_out.view(B, bz, max_len+1, -1)
                    score = torch.sum(score, axis=1)
                    score = torch.reshape(score,(B,bz))

                    all_scores = ctc_score + 0.1 * score  # FIXME: decoder weight needs tuning
                    best_index = torch.argmax(all_scores, dim=1)

                    best_sents = []
                    k = 0
                    for idx in best_index:
                        cur_best_sent = all_hyps[k: k + args.beam_size][idx]
                        best_sents.append(cur_best_sent)
                        k += args.beam_size
                    hyps = map_batch(best_sents, speech2text.asr_model.token_list, num_processes)

                elif args.mode == 'attention_lm_rescoring':
                    ctc_score, all_hyps = [], []
                    max_len = 0
                    for hyps in score_hyps:
                        cur_len = len(hyps)
                        if len(hyps) < args.beam_size:
                            hyps += (args.beam_size - cur_len) * [(-float("INF"), (0,))]
                        cur_ctc_score = []
                        for hyp in hyps:
                            cur_ctc_score.append(hyp[0])
                            all_hyps.append(list(hyp[1]))
                            if len(hyp[1]) > max_len:
                                max_len = len(hyp[1])
                        ctc_score.append(cur_ctc_score)
                
                    ctc_score = torch.tensor(ctc_score, dtype=torch.float32)
                    hyps_pad = torch.ones(
                        (args.batch_size, args.beam_size, max_len), dtype=torch.int64) * speech2text.asr_model.ignore_id # FIXME: ignore id
                    hyps_lens = torch.ones((args.batch_size, args.beam_size), dtype=torch.int32)
                    k = 0
                    for i in range(args.batch_size):
                        for j in range(args.beam_size):
                            cand = all_hyps[k]
                            l = len(cand)
                            hyps_pad[i][j][0:l] = torch.tensor(cand)
                            hyps_lens[i][j] = len(cand)
                            k += 1

                    bz = args.beam_size
                    B,T,F = encoder_out.shape
                    B2=B*bz
                    encoder_out = encoder_out.repeat(1, bz, 1).view(B2, T, F)
                    encoder_out_lens = encoder_out_lens.repeat(bz)

                    hyps_pad = hyps_pad.view(B2, max_len).cuda()
                    hyps_lens = hyps_lens.view(B2,).cuda()
                   
                    decoder_scores = -speech2text.asr_model.batchify_nll(
                        encoder_out, encoder_out_lens, hyps_pad, hyps_lens, 320
                    )
                    decoder_scores = torch.reshape(decoder_scores,(B,bz)).cpu()
                 

                    # Replace ignore_id padding with token 0 before LM scoring,
                    # then score each hypothesis by its total log-probability.
                    hyps_pad[hyps_pad == speech2text.asr_model.ignore_id] = 0
                    nnlm_nll, x_lengths = full_lm_model.batchify_nll(hyps_pad, hyps_lens, 64)
                    nnlm_scores = -nnlm_nll.sum(dim=1)

                    nnlm_scores = torch.reshape(nnlm_scores,(B,bz)).cpu()

                    all_scores = ctc_score - 0.05 * decoder_scores + 1.0 * nnlm_scores  # FIXME: interpolation weights need tuning
                    best_index = torch.argmax(all_scores, dim=1)

                    best_sents = []
                    k = 0
                    for idx in best_index:
                        cur_best_sent = all_hyps[k: k + args.beam_size][idx]
                        best_sents.append(cur_best_sent)
                        k += args.beam_size
                    hyps = map_batch(best_sents, speech2text.asr_model.token_list, num_processes)

                
                elif args.mode == 'lm_rescoring':
                    # lm_time = time.time()
                    
                    ctc_score, all_hyps = [], []
                    max_len = 0
                    for hyps in score_hyps:
                        cur_len = len(hyps)
                        if len(hyps) < args.beam_size:
                            hyps += (args.beam_size - cur_len) * [(-float("INF"), (0,))]
                        cur_ctc_score = []
                        for hyp in hyps:
                            cur_ctc_score.append(hyp[0])
                            all_hyps.append(list(hyp[1]))
                            if len(hyp[1]) > max_len:
                                max_len = len(hyp[1])
                        ctc_score.append(cur_ctc_score)
                
                    ctc_score = torch.tensor(ctc_score, dtype=torch.float32)
                    hyps_pad = torch.ones(
                        (args.batch_size, args.beam_size, max_len), dtype=torch.int64) * speech2text.asr_model.ignore_id # FIXME: ignore id
                    hyps_lens = torch.ones((args.batch_size, args.beam_size), dtype=torch.int32)
                    k = 0
                    for i in range(args.batch_size):
                        for j in range(args.beam_size):
                            cand = all_hyps[k]
                            l = len(cand)
                            hyps_pad[i][j][0:l] = torch.tensor(cand)
                            hyps_lens[i][j] = len(cand)
                            k += 1

                    bz = args.beam_size
                    B,T,F = encoder_out.shape
                    B2=B*bz

                    hyps_pad = hyps_pad.view(B2, max_len).cuda()
                    hyps_lens = hyps_lens.view(B2,).cuda()
                    hyps_pad[hyps_pad == speech2text.asr_model.ignore_id] = 0
                    nnlm_nll, x_lengths = full_lm_model.batchify_nll(hyps_pad, hyps_lens, 320)
                    
                    nnlm_scores = -nnlm_nll.sum(dim=1)

                    nnlm_scores = torch.reshape(nnlm_scores,(B,bz)).cpu()

                    all_scores = ctc_score + 0.9 * nnlm_scores  # FIXME: LM weight needs tuning
                    best_index = torch.argmax(all_scores, dim=1)

                    best_sents = []
                    k = 0
                    for idx in best_index:
                        cur_best_sent = all_hyps[k: k + args.beam_size][idx]
                        best_sents.append(cur_best_sent)
                        k += args.beam_size
                    hyps = map_batch(best_sents, speech2text.asr_model.token_list, num_processes)                    

                    count_time = time.time() - ll_time
                    count_times.append(count_time)
                    # lm_count = time.time() - lm_time
                    # print("lm elapsed:", lm_count)
                    # lm_times.append(lm_count)
                    # print("*"*50)

                else:
                    raise NotImplementedError
                
             
            for i, key in enumerate(names):
                content = hyps[i]
                # print('{} {}'.format(key, content))
                fout.write('{} {}\n'.format(key, content))

    time_end = time.perf_counter() - time_start
    # encoder_times = encoder_times[5:]
    # ctc_times = ctc_times[5:]
    # beam_search_times = beam_search_times[5:]
    # lm_times = lm_times[5:]
    # mean_encoder = np.mean(encoder_times)
    # mean_ctc = np.mean(ctc_times)
    # mean_beam_search = np.mean(beam_search_times)
    # mean_lm = np.mean(lm_times)

    # print("average encode time:", mean_encoder)
    # print("average ctc time:", mean_ctc)
    # print("average beam_search time:", mean_beam_search)
    # print("average lm time:", mean_lm)
    count_times = count_times[5:]  # drop the first few warm-up batches
    mean_count_time = np.mean(count_times)
    print("average time per batch:", mean_count_time, " fps: ", args.batch_size / mean_count_time)
#    if str(args.gpu) == '0':
    with open(args.log_file, 'w') as log:
        log.write(f"Decoding audio {audio_sample_len} secs, cost {time_end} secs, RTF: {time_end/audio_sample_len}, process {audio_sample_len/time_end} secs audio per second, decoding args: {args}")