import numpy as np
import librosa
from scipy.spatial.distance import euclidean
from jiwer import wer

def extract_mfccs(signal, sample_rate, n_mfcc=13):
    """Compute the MFCC matrix of an audio signal with librosa.

    Args:
        signal: Audio time series, passed to librosa as ``y``.
        sample_rate: Sampling rate of ``signal``, passed to librosa as ``sr``.
        n_mfcc: Number of cepstral coefficients to compute. Defaults to 13,
            the value this function previously hard-coded.

    Returns:
        Whatever ``librosa.feature.mfcc`` returns for these arguments; per
        the librosa documentation this is an array with one row per
        coefficient and one column per analysis frame.
    """
    return librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc)

def calculate_mcd(original_mfccs, converted_mfccs):
    """Return the RMS of the per-frame Euclidean distances between two MFCC matrices.

    Both inputs are treated as 2-D arrays whose columns are compared pairwise
    (column ``k`` of one against column ``k`` of the other). When the two
    matrices have a different number of columns, only the common leading
    columns are compared; no time alignment (e.g. DTW) is performed.

    NOTE: the result is the plain root-mean-square distance over all rows.
    It does not apply the ``10 / ln(10) * sqrt(2)`` scaling and it does not
    drop coefficient 0, so it is not directly comparable to MCD values
    reported in dB.

    Args:
        original_mfccs: 2-D array-like, rows are coefficients, columns are frames.
        converted_mfccs: 2-D array-like with the same number of rows.

    Returns:
        The root-mean-square frame distance as a NumPy float, or NaN when
        there are no frames to compare.

    Raises:
        ValueError: If either input is not 2-D or the row counts differ.
    """
    original = np.asarray(original_mfccs, dtype=float)
    converted = np.asarray(converted_mfccs, dtype=float)

    if original.ndim != 2 or converted.ndim != 2:
        raise ValueError("MFCC inputs must be 2-D (n_coefficients, n_frames)")
    if original.shape[0] != converted.shape[0]:
        raise ValueError(
            "MFCC inputs must have the same number of coefficients: "
            f"{original.shape[0]} != {converted.shape[0]}"
        )

    # Two recordings rarely have the same length; compare only the frames
    # both have instead of raising IndexError or silently ignoring a tail.
    n_frames = min(original.shape[1], converted.shape[1])
    if n_frames == 0:
        return np.float64(np.nan)

    diff = original[:, :n_frames] - converted[:, :n_frames]
    # Sum over coefficients gives the squared Euclidean distance per frame.
    squared_frame_distances = np.sum(diff ** 2, axis=0)
    return np.sqrt(np.mean(squared_frame_distances))

def calculate_wer(reference_text, hypothesis_text):
    """Compute the word error rate of a hypothesis against a reference.

    Thin wrapper around ``jiwer.wer``; both arguments are forwarded
    positionally and its result is returned unchanged.

    Args:
        reference_text: The ground-truth transcript.
        hypothesis_text: The transcript to evaluate.

    Returns:
        The value returned by ``jiwer.wer``.
    """
    error_rate = wer(reference_text, hypothesis_text)
    return error_rate

# Example audio files
original_audio_file = 'original.wav'
converted_audio_file = 'converted.wav'

# Load the audio. The original keeps its native sample rate; the converted
# file is resampled to that same rate so both MFCC matrices are computed on
# the same time/frequency basis (previously its own rate was discarded and
# the original's rate was assumed).
original_signal, sample_rate = librosa.load(original_audio_file, sr=None)
converted_signal, _ = librosa.load(converted_audio_file, sr=sample_rate)

# Extract MFCCs
original_mfccs = extract_mfccs(original_signal, sample_rate)
converted_mfccs = extract_mfccs(converted_signal, sample_rate)

# Compute the MCD
mcd = calculate_mcd(original_mfccs, converted_mfccs)
print(f"MCD: {mcd}")

# Assume we have a reference text and the text recognized from the converted audio
reference_text = "This is a reference text."
hypothesis_text = "This is a reference text."

# Compute the WER
wer_value = calculate_wer(reference_text, hypothesis_text)
print(f"WER: {wer_value * 100}%")