#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <math.h>
#include <webrtc_vad.h>
#include "Audio.h"

using namespace std;

// Sliding window over recent VAD decisions; put() returns the running sum of
// the last `window_size` values pushed in.
class AudioWindow {
  private:
    int *window;
    int in_idx;
    int out_idx;
    int sum;
    int window_size = 0;

  public:
    AudioWindow(int window_size) : window_size(window_size)
    {
        window = (int *)calloc(window_size + 1, sizeof(int));
        in_idx = 0;
        out_idx = 1;
        sum = 0;
    };
    ~AudioWindow()
    {
        free(window);
    };
    int put(int val)
    {
        sum = sum + val - window[out_idx];
        window[in_idx] = val;
        in_idx = in_idx == window_size ? 0 : in_idx + 1;
        out_idx = out_idx == window_size ? 0 : out_idx + 1;
        return sum;
    };
};

AudioFrame::AudioFrame(){};

AudioFrame::AudioFrame(int len) : len(len)
{
    start = 0;
};

AudioFrame::~AudioFrame(){};

int AudioFrame::set_start(int val)
{
    start = val < 0 ? 0 : val;
    return start;
};

// Align the frame end so the segment covers a whole number of 400-sample
// frames with a 160-sample shift, matching the front-end framing.
int AudioFrame::set_end(int val, int max_len)
{
    float num_samples = val - start;
    float frame_length = 400;
    float frame_shift = 160;
    float num_new_samples =
        ceil((num_samples - frame_length) / frame_shift) * frame_shift + frame_length;
    end = start + num_new_samples;
    len = (int)num_new_samples;
    if (end > max_len)
        printf("frame end > max_len!\n");
    return end;
};

int AudioFrame::get_start()
{
    return start;
};

int AudioFrame::get_len()
{
    return len;
};

int AudioFrame::disp()
{
    printf("not implemented!\n");
    return 0;
};

Audio::Audio(int data_type) : data_type(data_type)
{
    speech_buff = NULL;
    speech_data = NULL;
    align_size = 1360;
}

Audio::Audio(int data_type, int size) : data_type(data_type)
{
    speech_buff = NULL;
    speech_data = NULL;
    align_size = (float)size;
}

Audio::~Audio()
{
    if (speech_buff != NULL) {
        free(speech_buff);
        free(speech_data);
    }
}

void Audio::disp()
{
    printf("Audio time is %f s. len is %d\n", (float)speech_len / 16000, speech_len);
}

// Load a 16 kHz, 16-bit mono PCM WAV file. The 44-byte header is skipped and
// the samples are converted to float (scaled by 1/32768 when data_type == 1).
bool Audio::loadwav(const char *filename)
{
    if (speech_buff != NULL) {
        free(speech_buff);
        free(speech_data);
    }
    offset = 0;

    FILE *fp = fopen(filename, "rb");
    if (fp == nullptr)
        return false;
    fseek(fp, 0, SEEK_END);
    uint32_t nFileLen = ftell(fp);
    fseek(fp, 44, SEEK_SET);

    speech_len = (nFileLen - 44) / 2;
    // Round the buffer length up to a multiple of align_size.
    speech_align_len = (int)(ceil((float)speech_len / align_size) * align_size);
    speech_buff = (int16_t *)malloc(sizeof(int16_t) * speech_align_len);
    memset(speech_buff, 0, sizeof(int16_t) * speech_align_len);
    int ret = fread(speech_buff, sizeof(int16_t), speech_len, fp);
    (void)ret;
    fclose(fp);

    speech_data = (float *)malloc(sizeof(float) * speech_align_len);
    memset(speech_data, 0, sizeof(float) * speech_align_len);

    float scale = 1;
    if (data_type == 1) {
        scale = 32768;
    }
    for (int i = 0; i < speech_len; i++) {
        speech_data[i] = (float)speech_buff[i] / scale;
    }

    // Queue the whole utterance as a single frame.
    AudioFrame *frame = new AudioFrame(speech_len);
    frame_queue.push(frame);

    return true;
}

// Hand out the next chunk of `len` samples. Returns S_MIDDLE until the final
// chunk, which returns S_END; returns S_ERR once the data is exhausted.
int Audio::fetch_chunck(float *&dout, int len)
{
    if (offset >= speech_align_len) {
        dout = NULL;
        return S_ERR;
    } else if (offset == speech_align_len - len) {
        dout = speech_data + offset;
        offset = speech_align_len;
        // Temporary workaround: drop the pending frame once the last chunk is out.
        AudioFrame *frame = frame_queue.front();
        frame_queue.pop();
        delete frame;
        return S_END;
    } else {
        dout = speech_data + offset;
        offset += len;
        return S_MIDDLE;
    }
}

// Pop the next queued frame and expose its samples. Returns 1 on success,
// 0 when the queue is empty.
int Audio::fetch(float *&dout, int &len, int &flag)
{
    if (frame_queue.size() > 0) {
        AudioFrame *frame = frame_queue.front();
        frame_queue.pop();
        dout = speech_data + frame->get_start();
        len = frame->get_len();
        delete frame;
        flag = S_END;
        return 1;
    } else {
        return 0;
    }
}

// Reflect-pad the signal on both sides so the framing (400-sample frames,
// 160-sample shift) covers every input sample.
void Audio::padding()
{
    float num_samples = speech_len;
    float frame_length = 400;
    float frame_shift = 160;
    float num_frames = floor((num_samples + (frame_shift / 2)) / frame_shift);
    float num_new_samples = (num_frames - 1) * frame_shift + frame_length;
    float num_padding = num_new_samples - num_samples;
    float num_left_padding = (frame_length - frame_shift) / 2;
    float num_right_padding = num_padding - num_left_padding;

    float *new_data = (float *)malloc(num_new_samples * sizeof(float));

    int i;
    int tmp_off = 0;
    // Left padding: mirror the first samples.
    for (i = 0; i < num_left_padding; i++) {
        int ii = num_left_padding - i - 1;
        new_data[i] = speech_data[ii];
    }
    tmp_off = num_left_padding;
    memcpy(new_data + tmp_off, speech_data, speech_len * sizeof(float));
    tmp_off += speech_len;
    // Right padding: mirror the last samples.
    for (i = 0; i < num_right_padding; i++) {
        int ii = speech_len - i - 1;
        new_data[tmp_off + i] = speech_data[ii];
    }

    free(speech_data);
    speech_data = new_data;
    speech_len = num_new_samples;

    // Replace the queued whole-utterance frame with one of the padded length.
    AudioFrame *frame = new AudioFrame(num_new_samples);
    frame_queue.push(frame);
    frame = frame_queue.front();
    frame_queue.pop();
    delete frame;
}

#define UNTRIGGERED 0
#define TRIGGERED 1
#define SPEECH_LEN_5S (16000 * 5)
#define SPEECH_LEN_10S (16000 * 10)
#define SPEECH_LEN_20S (16000 * 20)
#define SPEECH_LEN_30S (16000 * 30)

// Split the loaded audio into speech segments with WebRTC VAD. A sliding
// window over per-step VAD decisions opens a segment once enough voiced steps
// accumulate; the segment is closed when voiced activity in the window drops
// below a threshold that loosens as the segment grows longer.
void Audio::split()
{
    VadInst *handle = WebRtcVad_Create();
    WebRtcVad_Init(handle);
    WebRtcVad_set_mode(handle, 2);

    int window_size = 10;
    AudioWindow audiowindow(window_size);
    int status = UNTRIGGERED;
    int offset = 0;
    int fs = 16000;
    int step = 480;  // 30 ms at 16 kHz

    // Drop the whole-utterance frame queued by loadwav()/padding().
    AudioFrame *frame;
    frame = frame_queue.front();
    frame_queue.pop();
    delete frame;
    frame = NULL;

    while (offset < speech_len - step) {
        int n = WebRtcVad_Process(handle, fs, speech_buff + offset, step);
        if (status == UNTRIGGERED && audiowindow.put(n) >= window_size - 1) {
            // Enough voiced steps in the window: open a segment, backdated to
            // the start of the window.
            frame = new AudioFrame();
            int start = offset - step * (window_size - 1);
            frame->set_start(start);
            status = TRIGGERED;
        } else if (status == TRIGGERED) {
            int win_weight = audiowindow.put(n);
            int voice_len = offset - frame->get_start();
            int gap = 0;
            if (voice_len < SPEECH_LEN_5S) {
                offset += step;
                continue;
            } else if (voice_len < SPEECH_LEN_10S) {
                gap = 1;
            } else if (voice_len < SPEECH_LEN_20S) {
                gap = window_size / 5;
            } else {
                gap = window_size / 2;
            }
            if (win_weight < gap) {
                // Voiced activity fell below the threshold: close the segment.
                status = UNTRIGGERED;
                offset = frame->set_end(offset, speech_align_len);
                frame_queue.push(frame);
                frame = NULL;
            }
        }
        offset += step;
    }
    // Close a segment that is still open at the end of the audio.
    if (frame != NULL) {
        frame->set_end(speech_len, speech_align_len);
        frame_queue.push(frame);
        frame = NULL;
    }
    WebRtcVad_Free(handle);
}
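
// ---------------------------------------------------------------------------
// Usage sketch, not part of the library above: one plausible call sequence,
// assuming Audio.h declares the members used in this file and defines
// S_END/S_ERR/S_MIDDLE. The file name "example.wav" and the
// AUDIO_USAGE_EXAMPLE guard are placeholders for illustration only.
#ifdef AUDIO_USAGE_EXAMPLE
int main()
{
    Audio audio(1);                      // data_type 1: scale int16 samples by 1/32768
    if (!audio.loadwav("example.wav"))   // hypothetical 16 kHz, 16-bit mono WAV
        return 1;
    audio.disp();
    audio.split();                       // VAD segmentation; padding() is the
                                         // alternative when the whole utterance
                                         // should stay as one frame
    float *segment = NULL;
    int len = 0;
    int flag = 0;
    while (audio.fetch(segment, len, flag) > 0) {
        // Each fetched segment is `len` float samples starting at `segment`;
        // this is where a feature extractor / recognizer would consume it.
        printf("segment: %d samples (%.2f s)\n", len, (float)len / 16000);
    }
    return 0;
}
#endif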